pack.c 32.4 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3
 *
Vicent Marti committed
4 5
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
6 7
 */

8
#include "common.h"
9 10
#include "odb.h"
#include "pack.h"
11
#include "delta.h"
12
#include "sha1_lookup.h"
13 14
#include "mwindow.h"
#include "fileops.h"
15
#include "oid.h"
16

17
#include <zlib.h>
18

19 20
GIT__USE_OFFMAP
GIT__USE_OIDMAP
21

22
static int packfile_open(struct git_pack_file *p);
23
static git_off_t nth_packed_object_offset(const struct git_pack_file *p, uint32_t n);
24
static int packfile_unpack_compressed(
25 26 27
		git_rawobj *obj,
		struct git_pack_file *p,
		git_mwindow **w_curs,
28
		git_off_t *curpos,
29 30 31 32 33
		size_t size,
		git_otype type);

/* Can find the offset of an object given
 * a prefix of an identifier.
34
 * Throws GIT_EAMBIGUOUSOIDPREFIX if short oid
35 36 37 38 39
 * is ambiguous within the pack.
 * This method assumes that len is between
 * GIT_OID_MINPREFIXLEN and GIT_OID_HEXSZ.
 */
static int pack_entry_find_offset(
40
		git_off_t *offset_out,
41 42 43
		git_oid *found_oid,
		struct git_pack_file *p,
		const git_oid *short_oid,
44
		size_t len);
45

46 47 48 49 50 51
/*
 * Record an ODB-class error describing an invalid pack file.
 *
 * Always returns -1, so callers can write `return packfile_error(...)`.
 */
static int packfile_error(const char *message)
{
	const int failure = -1;

	giterr_set(GITERR_ODB, "Invalid pack file - %s", message);

	return failure;
}

52 53 54
/********************
 * Delta base cache
 ********************/
55

56
/*
 * Allocate a cache entry wrapping a shallow copy of `source`.
 *
 * The entry starts with its reference count incremented once, on behalf
 * of the caller. Only the git_rawobj struct is copied, so the entry
 * shares `source->data` with the caller. Returns NULL if the
 * allocation fails.
 */
static git_pack_cache_entry *new_cache_object(git_rawobj *source)
{
	git_pack_cache_entry *entry;

	entry = git__calloc(1, sizeof(git_pack_cache_entry));
	if (entry != NULL) {
		git_atomic_inc(&entry->refcount);
		memcpy(&entry->raw, source, sizeof(git_rawobj));
	}

	return entry;
}

static void free_cache_object(void *o)
{
	git_pack_cache_entry *e = (git_pack_cache_entry *)o;

	if (e != NULL) {
73
		assert(e->refcount.val == 0);
74 75 76 77 78
		git__free(e->raw.data);
		git__free(e);
	}
}

79 80 81 82 83 84 85 86 87 88 89
/*
 * Free every entry stored in the cache, then the offset map itself.
 * Does nothing when the map was never allocated (or was already freed).
 */
static void cache_free(git_pack_cache *cache)
{
	khiter_t pos;

	if (!cache->entries)
		return;

	pos = kh_begin(cache->entries);
	while (pos != kh_end(cache->entries)) {
		if (kh_exist(cache->entries, pos))
			free_cache_object(kh_value(cache->entries, pos));
		pos++;
	}

	git_offmap_free(cache->entries);
	cache->entries = NULL;
}

/*
 * Prepare a delta base cache for use: allocate the offset map, set the
 * memory limit and initialize the lock.
 *
 * Returns 0 on success, or -1 when the map cannot be allocated or the
 * mutex cannot be initialized. On failure cache->entries is left NULL.
 */
static int cache_init(git_pack_cache *cache)
{
	cache->entries = git_offmap_alloc();
	GITERR_CHECK_ALLOC(cache->entries);

	cache->memory_limit = GIT_PACK_CACHE_MEMORY_LIMIT;

	if (git_mutex_init(&cache->lock)) {
		giterr_set(GITERR_OS, "Failed to initialize pack cache mutex");

		/*
		 * Release the map through its own destructor, exactly as
		 * cache_free() does, rather than assuming that it consists
		 * of a single allocation.
		 */
		git_offmap_free(cache->entries);
		cache->entries = NULL;

		return -1;
	}

	return 0;
}

113
/*
 * Look up the cached object stored for `offset`.
 *
 * On a hit the entry's reference count is incremented and its
 * last_usage stamp is refreshed; the caller must decrement the count
 * when done. Returns NULL on a miss or when the cache lock cannot be
 * taken.
 */
static git_pack_cache_entry *cache_get(git_pack_cache *cache, git_off_t offset)
{
	git_pack_cache_entry *hit = NULL;
	khiter_t pos;

	if (git_mutex_lock(&cache->lock) < 0)
		return NULL;

	pos = kh_get(off, cache->entries, offset);
	if (pos != kh_end(cache->entries)) {
		/* found it */
		hit = kh_value(cache->entries, pos);
		git_atomic_inc(&hit->refcount);
		hit->last_usage = cache->use_ctr++;
	}

	git_mutex_unlock(&cache->lock);

	return hit;
}

132 133 134
/*
 * Evict every cache entry whose reference count is zero, subtracting
 * each evicted object's length from cache->memory_used.
 *
 * Run with the cache lock held.
 */
static void free_lowest_entry(git_pack_cache *cache)
{
	khiter_t pos = kh_begin(cache->entries);

	for (; pos != kh_end(cache->entries); pos++) {
		git_pack_cache_entry *candidate;

		if (!kh_exist(cache->entries, pos))
			continue;

		candidate = kh_value(cache->entries, pos);

		/* entries that are still referenced must stay alive */
		if (candidate == NULL || candidate->refcount.val != 0)
			continue;

		cache->memory_used -= candidate->raw.len;
		kh_del(off, cache->entries, pos);
		free_cache_object(candidate);
	}
}

152 153 154 155 156
/*
 * Try to store `base` in the cache under `offset`.
 *
 * On success returns 0 and stores the new entry in *cached_out; the
 * entry then shares base->data and carries one reference held on
 * behalf of the caller. Returns -1, leaving *cached_out untouched,
 * whenever the object was NOT cached: it is too large, the entry could
 * not be allocated, the lock could not be taken, another entry already
 * exists for this offset, or no room could be made for it. In all of
 * those cases the caller keeps ownership of base->data.
 */
static int cache_add(
		git_pack_cache_entry **cached_out,
		git_pack_cache *cache,
		git_rawobj *base,
		git_off_t offset)
{
	git_pack_cache_entry *entry;
	int error, added = 0;
	khiter_t k;

	if (base->len > GIT_PACK_CACHE_SIZE_LIMIT)
		return -1;

	/*
	 * An allocation failure has to be reported as "not cached";
	 * returning 0 here would make the caller believe that the cache
	 * took ownership of the data, which would then never be freed.
	 */
	entry = new_cache_object(base);
	if (!entry)
		return -1;

	if (git_mutex_lock(&cache->lock) < 0) {
		giterr_set(GITERR_OS, "failed to lock cache");
		git__free(entry);
		return -1;
	}

	/* Add it to the cache if nobody else has */
	if (kh_get(off, cache->entries, offset) == kh_end(cache->entries)) {
		/*
		 * free_lowest_entry() drops every unreferenced entry in one
		 * pass, so calling it again while holding the lock cannot
		 * free anything more. If there is still no room, skip
		 * caching instead of spinning.
		 */
		if (cache->memory_used + base->len > cache->memory_limit)
			free_lowest_entry(cache);

		if (cache->memory_used + base->len <= cache->memory_limit) {
			k = kh_put(off, cache->entries, offset, &error);
			assert(error != 0);
			kh_value(cache->entries, k) = entry;
			cache->memory_used += entry->raw.len;

			*cached_out = entry;
			added = 1;
		}
	}
	git_mutex_unlock(&cache->lock);

	if (!added) {
		/*
		 * Only the wrapper is released: the object data still
		 * belongs to the caller.
		 */
		git__free(entry);
		return -1;
	}

	return 0;
}

196 197 198 199 200 201 202 203
/***********************************************************
 *
 * PACK INDEX METHODS
 *
 ***********************************************************/

static void pack_index_free(struct git_pack_file *p)
{
204 205 206 207
	if (p->oids) {
		git__free(p->oids);
		p->oids = NULL;
	}
208 209 210 211 212 213
	if (p->index_map.data) {
		git_futils_mmap_free(&p->index_map);
		p->index_map.data = NULL;
	}
}

Vicent Marti committed
214
static int pack_index_check(const char *path, struct git_pack_file *p)
215 216 217 218 219 220 221
{
	struct git_pack_idx_header *hdr;
	uint32_t version, nr, i, *index;
	void *idx_map;
	size_t idx_size;
	struct stat st;
	int error;
222 223
	/* TODO: properly open the file without access time using O_NOATIME */
	git_file fd = git_futils_open_ro(path);
224
	if (fd < 0)
225
		return fd;
226

227 228 229 230 231 232 233
	if (p_fstat(fd, &st) < 0) {
		p_close(fd);
		giterr_set(GITERR_OS, "Unable to stat pack index '%s'", path);
		return -1;
	}

	if (!S_ISREG(st.st_mode) ||
234 235 236
		!git__is_sizet(st.st_size) ||
		(idx_size = (size_t)st.st_size) < 4 * 256 + 20 + 20)
	{
237
		p_close(fd);
238
		giterr_set(GITERR_ODB, "Invalid pack index '%s'", path);
239
		return -1;
240 241 242
	}

	error = git_futils_mmap_ro(&p->index_map, fd, 0, idx_size);
243

244 245
	p_close(fd);

246 247
	if (error < 0)
		return error;
248 249 250 251 252 253 254 255

	hdr = idx_map = p->index_map.data;

	if (hdr->idx_signature == htonl(PACK_IDX_SIGNATURE)) {
		version = ntohl(hdr->idx_version);

		if (version < 2 || version > 2) {
			git_futils_mmap_free(&p->index_map);
256
			return packfile_error("unsupported index version");
257 258 259 260 261 262 263 264 265
		}

	} else
		version = 1;

	nr = 0;
	index = idx_map;

	if (version > 1)
Vicent Marti committed
266
		index += 2; /* skip index header */
267 268 269 270 271

	for (i = 0; i < 256; i++) {
		uint32_t n = ntohl(index[i]);
		if (n < nr) {
			git_futils_mmap_free(&p->index_map);
272
			return packfile_error("index is non-monotonic");
273 274 275 276 277 278 279
		}
		nr = n;
	}

	if (version == 1) {
		/*
		 * Total size:
Vicent Marti committed
280 281 282 283
		 * - 256 index entries 4 bytes each
		 * - 24-byte entries * nr (20-byte sha1 + 4-byte offset)
		 * - 20-byte SHA1 of the packfile
		 * - 20-byte SHA1 file checksum
284 285 286
		 */
		if (idx_size != 4*256 + nr * 24 + 20 + 20) {
			git_futils_mmap_free(&p->index_map);
287
			return packfile_error("index is corrupted");
288 289 290 291
		}
	} else if (version == 2) {
		/*
		 * Minimum size:
Vicent Marti committed
292 293 294 295 296 297 298
		 * - 8 bytes of header
		 * - 256 index entries 4 bytes each
		 * - 20-byte sha1 entry * nr
		 * - 4-byte crc entry * nr
		 * - 4-byte offset entry * nr
		 * - 20-byte SHA1 of the packfile
		 * - 20-byte SHA1 file checksum
299 300 301 302 303 304 305 306 307 308 309 310
		 * And after the 4-byte offset table might be a
		 * variable sized table containing 8-byte entries
		 * for offsets larger than 2^31.
		 */
		unsigned long min_size = 8 + 4*256 + nr*(20 + 4 + 4) + 20 + 20;
		unsigned long max_size = min_size;

		if (nr)
			max_size += (nr - 1)*8;

		if (idx_size < min_size || idx_size > max_size) {
			git_futils_mmap_free(&p->index_map);
311
			return packfile_error("wrong index size");
312 313 314 315
		}
	}

	p->num_objects = nr;
316
	p->index_version = version;
317
	return 0;
318 319 320 321
}

static int pack_index_open(struct git_pack_file *p)
{
322
	int error = 0;
323 324
	size_t name_len;
	git_buf idx_name = GIT_BUF_INIT;
325

326
	if (p->index_version > -1)
Russell Belfer committed
327
		return 0;
328

329 330
	name_len = strlen(p->pack_name);
	assert(name_len > strlen(".pack")); /* checked by git_pack_file alloc */
331

332 333 334 335 336
	git_buf_grow(&idx_name, name_len);
	git_buf_put(&idx_name, p->pack_name, name_len - strlen(".pack"));
	git_buf_puts(&idx_name, ".idx");
	if (git_buf_oom(&idx_name)) {
		giterr_set_oom();
Russell Belfer committed
337
		return -1;
338
	}
339

340
	if ((error = git_mutex_lock(&p->lock)) < 0) {
341
		git_buf_free(&idx_name);
Russell Belfer committed
342
		return error;
343
	}
Russell Belfer committed
344

345
	if (p->index_version == -1)
346
		error = pack_index_check(idx_name.ptr, p);
347

348
	git_buf_free(&idx_name);
349

350 351
	git_mutex_unlock(&p->lock);

352
	return error;
353 354 355 356
}

static unsigned char *pack_window_open(
		struct git_pack_file *p,
357
		git_mwindow **w_cursor,
358
		git_off_t offset,
359 360
		unsigned int *left)
{
361
	if (p->mwf.fd == -1 && packfile_open(p) < 0)
362 363 364 365 366 367
		return NULL;

	/* Since packfiles end in a hash of their content and it's
	 * pointless to ask for an offset into the middle of that
	 * hash, and the pack_window_contains function above wouldn't match
	 * don't allow an offset too close to the end of the file.
368 369 370
	 *
	 * Don't allow a negative offset, as that means we've wrapped
	 * around.
371 372 373
	 */
	if (offset > (p->mwf.size - 20))
		return NULL;
374 375
	if (offset < 0)
		return NULL;
376 377 378 379

	return git_mwindow_open(&p->mwf, w_cursor, offset, 20, left);
 }

380 381 382 383 384 385 386 387
/*
 * The per-object header is a pretty dense thing, which is
 *  - first byte: low four bits are "size",
 *    then three bits of "type",
 *    with the high bit being "size continues".
 *  - each byte afterwards: low seven bits are size continuation,
 *    with the high bit being "size continues"
 */
388
size_t git_packfile__object_header(unsigned char *hdr, size_t size, git_otype type)
389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
{
	unsigned char *hdr_base;
	unsigned char c;

	assert(type >= GIT_OBJ_COMMIT && type <= GIT_OBJ_REF_DELTA);

	/* TODO: add support for chunked objects; see git.git 6c0d19b1 */

	c = (unsigned char)((type << 4) | (size & 15));
	size >>= 4;
	hdr_base = hdr;

	while (size) {
		*hdr++ = c | 0x80;
		c = size & 0x7f;
		size >>= 7;
	}
	*hdr++ = c;

408
	return (hdr - hdr_base);
409 410 411
}


412 413
/*
 * Decode a per-object header from `buf`, which holds `len` bytes.
 *
 * On success returns 0 and stores the object type in *type, the
 * decoded size in *sizep and the number of header bytes in *usedp.
 * Returns GIT_EBUFS when the header continues past the end of the
 * buffer, or -1 (with *usedp set to 0) when the encoded size does not
 * fit in a long.
 */
static int packfile_unpack_header1(
		unsigned long *usedp,
		size_t *sizep,
		git_otype *type,
		const unsigned char *buf,
		unsigned long len)
{
	unsigned long consumed = 0;
	unsigned long value, byte;
	unsigned shift;

	/* first byte: type in bits 4-6, low four size bits in bits 0-3 */
	byte = buf[consumed++];
	*type = (byte >> 4) & 7;
	value = byte & 15;

	/* every following byte contributes seven more size bits */
	for (shift = 4; byte & 0x80; shift += 7) {
		if (len <= consumed) {
			giterr_set(GITERR_ODB, "buffer too small");
			return GIT_EBUFS;
		}

		if (bitsizeof(long) <= shift) {
			*usedp = 0;
			giterr_set(GITERR_ODB, "packfile corrupted");
			return -1;
		}

		byte = buf[consumed++];
		value += (byte & 0x7f) << shift;
	}

	*sizep = (size_t)value;
	*usedp = consumed;
	return 0;
}

int git_packfile_unpack_header(
		size_t *size_p,
		git_otype *type_p,
		git_mwindow_file *mwf,
		git_mwindow **w_curs,
454
		git_off_t *curpos)
455 456 457 458
{
	unsigned char *base;
	unsigned int left;
	unsigned long used;
459
	int ret;
460 461

	/* pack_window_open() assures us we have [base, base + 20) available
Vicent Marti committed
462 463
	 * as a range that we can look at at. (Its actually the hash
	 * size that is assured.) With our object header encoding
464 465 466
	 * the maximum deflated object size is 2^137, which is just
	 * insane, so we know won't exceed what we have been given.
	 */
467
/*	base = pack_window_open(p, w_curs, *curpos, &left); */
468 469
	base = git_mwindow_open(mwf, w_curs, *curpos, 20, &left);
	if (base == NULL)
470
		return GIT_EBUFS;
471

472
	ret = packfile_unpack_header1(&used, size_p, type_p, base, left);
473
	git_mwindow_close(w_curs);
474
	if (ret == GIT_EBUFS)
475 476
		return ret;
	else if (ret < 0)
477
		return packfile_error("header length is zero");
478 479

	*curpos += used;
480
	return 0;
481 482
}

483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501
/*
 * Determine the final size and type of the object stored at `offset`
 * without unpacking it.
 *
 * For a delta, the size is read from the delta header, and the type is
 * found by following the chain of bases until a non-delta object is
 * reached. Returns 0 on success, or a negative value when a header
 * cannot be read or a delta base cannot be located.
 */
int git_packfile_resolve_header(
		size_t *size_p,
		git_otype *type_p,
		struct git_pack_file *p,
		git_off_t offset)
{
	git_mwindow *w_curs = NULL;
	git_off_t curpos = offset;
	size_t size;
	git_otype type;
	git_off_t base_offset = 0;
	int error;

	error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos);
	if (error < 0)
		return error;

	if (type == GIT_OBJ_OFS_DELTA || type == GIT_OBJ_REF_DELTA) {
		size_t base_size;
		git_packfile_stream stream;

		base_offset = get_delta_base(p, &w_curs, &curpos, type, offset);
		git_mwindow_close(&w_curs);

		/*
		 * get_delta_base() reports failures through its return
		 * value; such a value must never be used as an offset.
		 */
		if (base_offset == 0)
			return packfile_error("delta offset is zero");
		if (base_offset < 0) /* must actually be an error code */
			return (int)base_offset;

		if ((error = git_packfile_stream_open(&stream, p, curpos)) < 0)
			return error;
		error = git_delta_read_header_fromstream(&base_size, size_p, &stream);
		git_packfile_stream_free(&stream);
		if (error < 0)
			return error;
	} else
		*size_p = size;

	/* walk down the delta chain until we find the real object type */
	while (type == GIT_OBJ_OFS_DELTA || type == GIT_OBJ_REF_DELTA) {
		curpos = base_offset;
		error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos);
		if (error < 0)
			return error;
		if (type != GIT_OBJ_OFS_DELTA && type != GIT_OBJ_REF_DELTA)
			break;

		base_offset = get_delta_base(p, &w_curs, &curpos, type, base_offset);
		git_mwindow_close(&w_curs);

		if (base_offset == 0)
			return packfile_error("delta offset is zero");
		if (base_offset < 0) /* must actually be an error code */
			return (int)base_offset;
	}
	*type_p = type;

	return error;
}

530 531
#define SMALL_STACK_SIZE 64

532 533 534 535 536 537
/**
 * Generate the chain of dependencies which we need to get to the
 * object at `off`. `chain` is used a stack, popping gives the right
 * order to apply deltas on. If an object is found in the pack's base
 * cache, we stop calculating there.
 */
538 539 540 541
static int pack_dependency_chain(git_dependency_chain *chain_out,
				 git_pack_cache_entry **cached_out, git_off_t *cached_off,
				 struct pack_chain_elem *small_stack, size_t *stack_sz,
				 struct git_pack_file *p, git_off_t obj_offset)
542 543 544 545
{
	git_dependency_chain chain = GIT_ARRAY_INIT;
	git_mwindow *w_curs = NULL;
	git_off_t curpos = obj_offset, base_offset;
546 547
	int error = 0, use_heap = 0;
	size_t size, elem_pos;
548 549
	git_otype type;

550
	elem_pos = 0;
551 552 553 554 555 556 557 558 559 560 561
	while (true) {
		struct pack_chain_elem *elem;
		git_pack_cache_entry *cached = NULL;

		/* if we have a base cached, we can stop here instead */
		if ((cached = cache_get(&p->bases, obj_offset)) != NULL) {
			*cached_out = cached;
			*cached_off = obj_offset;
			break;
		}

562 563 564 565 566 567 568 569 570
		/* if we run out of space on the small stack, use the array */
		if (elem_pos == SMALL_STACK_SIZE) {
			git_array_init_to_size(chain, elem_pos);
			GITERR_CHECK_ARRAY(chain);
			memcpy(chain.ptr, small_stack, elem_pos * sizeof(struct pack_chain_elem));
			chain.size = elem_pos;
			use_heap = 1;
		}

571
		curpos = obj_offset;
572 573 574 575 576 577 578 579
		if (!use_heap) {
			elem = &small_stack[elem_pos];
		} else {
			elem = git_array_alloc(chain);
			if (!elem) {
				error = -1;
				goto on_error;
			}
580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613
		}

		elem->base_key = obj_offset;

		error = git_packfile_unpack_header(&size, &type, &p->mwf, &w_curs, &curpos);

		if (error < 0)
			goto on_error;

		elem->offset = curpos;
		elem->size = size;
		elem->type = type;
		elem->base_key = obj_offset;

		if (type != GIT_OBJ_OFS_DELTA && type != GIT_OBJ_REF_DELTA)
			break;

		base_offset = get_delta_base(p, &w_curs, &curpos, type, obj_offset);
		git_mwindow_close(&w_curs);

		if (base_offset == 0) {
			error = packfile_error("delta offset is zero");
			goto on_error;
		}
		if (base_offset < 0) { /* must actually be an error code */
			error = (int)base_offset;
			goto on_error;
		}

		/* we need to pass the pos *after* the delta-base bit */
		elem->offset = curpos;

		/* go through the loop again, but with the new object */
		obj_offset = base_offset;
614
		elem_pos++;
615 616
	}

617 618
	
	*stack_sz = elem_pos + 1;
619 620 621 622 623 624 625 626
	*chain_out = chain;
	return error;

on_error:
	git_array_clear(chain);
	return error;
}

627
int git_packfile_unpack(
628 629 630
	git_rawobj *obj,
	struct git_pack_file *p,
	git_off_t *obj_offset)
631 632
{
	git_mwindow *w_curs = NULL;
633
	git_off_t curpos = *obj_offset;
634 635
	int error, free_base = 0;
	git_dependency_chain chain = GIT_ARRAY_INIT;
636
	struct pack_chain_elem *elem = NULL, *stack;
637
	git_pack_cache_entry *cached = NULL;
638
	struct pack_chain_elem small_stack[SMALL_STACK_SIZE];
639
	size_t stack_size = 0, elem_pos, alloclen;
640
	git_otype base_type;
641 642 643 644 645

	/*
	 * TODO: optionally check the CRC on the packfile
	 */

646
	error = pack_dependency_chain(&chain, &cached, obj_offset, small_stack, &stack_size, p, *obj_offset);
647 648 649
	if (error < 0)
		return error;

650 651 652 653
	obj->data = NULL;
	obj->len = 0;
	obj->type = GIT_OBJ_BAD;

654 655 656 657
	/* let's point to the right stack */
	stack = chain.ptr ? chain.ptr : small_stack;

	elem_pos = stack_size;
658
	if (cached) {
659
		memcpy(obj, &cached->raw, sizeof(git_rawobj));
660
		base_type = obj->type;
661
		elem_pos--;	/* stack_size includes the base, which isn't actually there */
662
	} else {
663
		elem = &stack[--elem_pos];
664
		base_type = elem->type;
665
	}
666

667 668 669 670 671
	switch (base_type) {
	case GIT_OBJ_COMMIT:
	case GIT_OBJ_TREE:
	case GIT_OBJ_BLOB:
	case GIT_OBJ_TAG:
672
		if (!cached) {
673 674 675
			curpos = elem->offset;
			error = packfile_unpack_compressed(obj, p, &w_curs, &curpos, elem->size, elem->type);
			git_mwindow_close(&w_curs);
676
			base_type = elem->type;
677 678 679 680 681 682 683 684 685 686 687 688 689
		}
		if (error < 0)
			goto cleanup;
		break;
	case GIT_OBJ_OFS_DELTA:
	case GIT_OBJ_REF_DELTA:
		error = packfile_error("dependency chain ends in a delta");
		goto cleanup;
	default:
		error = packfile_error("invalid packfile type in header");
		goto cleanup;
	}

690
	/*
691
	 * Finding the object we want a cached base element is
692 693 694 695
	 * problematic, as we need to make sure we don't accidentally
	 * give the caller the cached object, which it would then feel
	 * free to free, so we need to copy the data.
	 */
696
	if (cached && stack_size == 1) {
697
		void *data = obj->data;
698

699 700
		GITERR_CHECK_ALLOC_ADD(&alloclen, obj->len, 1);
		obj->data = git__malloc(alloclen);
701
		GITERR_CHECK_ALLOC(obj->data);
702

703 704 705 706 707
		memcpy(obj->data, data, obj->len + 1);
		git_atomic_dec(&cached->refcount);
		goto cleanup;
	}

708
	/* we now apply each consecutive delta until we run out */
709
	while (elem_pos > 0 && !error) {
710 711
		git_rawobj base, delta;

712 713 714 715 716
		/*
		 * We can now try to add the base to the cache, as
		 * long as it's not already the cached one.
		 */
		if (!cached)
717
			free_base = !!cache_add(&cached, &p->bases, obj, elem->base_key);
718

719
		elem = &stack[elem_pos - 1];
720 721 722 723 724 725 726 727 728 729 730 731 732
		curpos = elem->offset;
		error = packfile_unpack_compressed(&delta, p, &w_curs, &curpos, elem->size, elem->type);
		git_mwindow_close(&w_curs);

		if (error < 0)
			break;

		/* the current object becomes the new base, on which we apply the delta */
		base = *obj;
		obj->data = NULL;
		obj->len = 0;
		obj->type = GIT_OBJ_BAD;

733
		error = git_delta_apply(&obj->data, &obj->len, base.data, base.len, delta.data, delta.len);
734
		obj->type = base_type;
735

736 737 738 739 740 741
		/*
		 * We usually don't want to free the base at this
		 * point, as we put it into the cache in the previous
		 * iteration. free_base lets us know that we got the
		 * base object directly from the packfile, so we can free it.
		 */
742
		git__free(delta.data);
743 744 745 746 747 748 749 750 751
		if (free_base) {
			free_base = 0;
			git__free(base.data);
		}

		if (cached) {
			git_atomic_dec(&cached->refcount);
			cached = NULL;
		}
752 753 754

		if (error < 0)
			break;
755

756
		elem_pos--;
757 758
	}

759
cleanup:
760 761 762
	if (error < 0)
		git__free(obj->data);

763
	if (elem)
764
		*obj_offset = curpos;
765

766
	git_array_clear(chain);
767
	return error;
768 769
}

Russell Belfer committed
770 771 772 773 774 775 776 777 778 779 780 781
/*
 * Allocation callback installed as `zalloc` on the zlib streams in
 * this file: forwards to git__calloc(). `opaq` is unused.
 */
static void *use_git_alloc(void *opaq, unsigned int count, unsigned int size)
{
	void *mem;

	GIT_UNUSED(opaq);

	mem = git__calloc(count, size);
	return mem;
}

/*
 * Deallocation callback installed as `zfree` on the zlib streams in
 * this file: forwards to git__free(). `opaq` is unused.
 */
static void use_git_free(void *opaq, void *ptr)
{
	GIT_UNUSED(opaq);

	git__free(ptr);
}

782 783 784 785 786 787 788 789 790 791 792 793 794
/*
 * Initialize `obj` for streaming the compressed data that starts at
 * `curpos` in pack `p`.
 *
 * The stream is zeroed and its zlib state is set up with this file's
 * allocator callbacks. Returns 0 on success, -1 when zlib cannot be
 * initialized.
 */
int git_packfile_stream_open(git_packfile_stream *obj, struct git_pack_file *p, git_off_t curpos)
{
	memset(obj, 0, sizeof(git_packfile_stream));

	obj->curpos = curpos;
	obj->p = p;

	obj->zstream.zalloc = use_git_alloc;
	obj->zstream.zfree = use_git_free;
	obj->zstream.next_in = Z_NULL;
	obj->zstream.next_out = Z_NULL;

	if (inflateInit(&obj->zstream) != Z_OK) {
		giterr_set(GITERR_ZLIB, "failed to init packfile stream");
		return -1;
	}

	return 0;
}

/*
 * Inflate up to `len` bytes from the stream into `buffer`.
 *
 * Returns the number of bytes produced, 0 once the end of the stream
 * has been reached on an earlier call, GIT_EBUFS when no window could
 * be opened or nothing could be produced although the stream is not
 * finished, and -1 on a zlib error.
 */
ssize_t git_packfile_stream_read(git_packfile_stream *obj, void *buffer, size_t len)
{
	unsigned char *window;
	size_t produced;
	int zret;

	if (obj->done)
		return 0;

	window = pack_window_open(obj->p, &obj->mw, obj->curpos, &obj->zstream.avail_in);
	if (window == NULL)
		return GIT_EBUFS;

	obj->zstream.next_in = window;
	obj->zstream.next_out = buffer;
	obj->zstream.avail_out = (unsigned int)len;

	zret = inflate(&obj->zstream, Z_SYNC_FLUSH);
	git_mwindow_close(&obj->mw);

	/* remember how much input was consumed and how much output was made */
	obj->curpos += obj->zstream.next_in - window;
	produced = len - obj->zstream.avail_out;

	if (zret == Z_STREAM_END) {
		obj->done = 1;
		return produced;
	}

	if (zret != Z_OK) {
		giterr_set(GITERR_ZLIB, "error reading from the zlib stream");
		return -1;
	}

	/* If we didn't write anything out but we're not done, we need more data */
	if (!produced)
		return GIT_EBUFS;

	return produced;
}

/*
 * Release the zlib state held by a stream that was set up with
 * git_packfile_stream_open(). The stream object itself is not freed.
 */
void git_packfile_stream_free(git_packfile_stream *obj)
{
	(void)inflateEnd(&obj->zstream);
}

847
static int packfile_unpack_compressed(
848 849 850 851 852 853
	git_rawobj *obj,
	struct git_pack_file *p,
	git_mwindow **w_curs,
	git_off_t *curpos,
	size_t size,
	git_otype type)
854
{
855
	size_t buf_size;
856 857 858 859
	int st;
	z_stream stream;
	unsigned char *buffer, *in;

860 861
	GITERR_CHECK_ALLOC_ADD(&buf_size, size, 1);
	buffer = git__calloc(1, buf_size);
862
	GITERR_CHECK_ALLOC(buffer);
863 864 865

	memset(&stream, 0, sizeof(stream));
	stream.next_out = buffer;
866
	stream.avail_out = (uInt)buf_size;
Russell Belfer committed
867 868
	stream.zalloc = use_git_alloc;
	stream.zfree = use_git_free;
869 870 871

	st = inflateInit(&stream);
	if (st != Z_OK) {
872
		git__free(buffer);
873
		giterr_set(GITERR_ZLIB, "failed to init zlib stream on unpack");
874

875
		return -1;
876 877 878
	}

	do {
879
		in = pack_window_open(p, w_curs, *curpos, &stream.avail_in);
880 881
		stream.next_in = in;
		st = inflate(&stream, Z_FINISH);
882
		git_mwindow_close(w_curs);
883 884 885 886

		if (!stream.avail_out)
			break; /* the payload is larger than it should be */

887 888 889
		if (st == Z_BUF_ERROR && in == NULL) {
			inflateEnd(&stream);
			git__free(buffer);
890
			return GIT_EBUFS;
891 892
		}

893
		*curpos += stream.next_in - in;
894 895 896 897 898
	} while (st == Z_OK || st == Z_BUF_ERROR);

	inflateEnd(&stream);

	if ((st != Z_STREAM_END) || stream.total_out != size) {
899
		git__free(buffer);
900
		giterr_set(GITERR_ZLIB, "error inflating zlib stream");
901
		return -1;
902 903 904 905 906
	}

	obj->type = type;
	obj->len = size;
	obj->data = buffer;
907
	return 0;
908 909
}

910 911 912 913
/*
 * curpos is where the data starts, delta_obj_offset is the where the
 * header starts
 */
914 915 916 917 918 919
git_off_t get_delta_base(
	struct git_pack_file *p,
	git_mwindow **w_curs,
	git_off_t *curpos,
	git_otype type,
	git_off_t delta_obj_offset)
920
{
921 922
	unsigned int left = 0;
	unsigned char *base_info;
923
	git_off_t base_offset;
924 925
	git_oid unused;

926 927 928
	base_info = pack_window_open(p, w_curs, *curpos, &left);
	/* Assumption: the only reason this would fail is because the file is too small */
	if (base_info == NULL)
929
		return GIT_EBUFS;
930 931
	/* pack_window_open() assured us we have [base_info, base_info + 20)
	 * as a range that we can look at without walking off the
Vicent Marti committed
932 933
	 * end of the mapped window. Its actually the hash size
	 * that is assured. An OFS_DELTA longer than the hash size
934 935 936 937 938 939 940
	 * is stupid, as then a REF_DELTA would be smaller to store.
	 */
	if (type == GIT_OBJ_OFS_DELTA) {
		unsigned used = 0;
		unsigned char c = base_info[used++];
		base_offset = c & 127;
		while (c & 128) {
941
			if (left <= used)
942
				return GIT_EBUFS;
943 944
			base_offset += 1;
			if (!base_offset || MSB(base_offset, 7))
Vicent Marti committed
945
				return 0; /* overflow */
946 947 948 949 950
			c = base_info[used++];
			base_offset = (base_offset << 7) + (c & 127);
		}
		base_offset = delta_obj_offset - base_offset;
		if (base_offset <= 0 || base_offset >= delta_obj_offset)
Vicent Marti committed
951
			return 0; /* out of bound */
952 953
		*curpos += used;
	} else if (type == GIT_OBJ_REF_DELTA) {
954 955
		/* If we have the cooperative cache, search in it first */
		if (p->has_cache) {
956 957
			khiter_t k;
			git_oid oid;
958

959 960 961
			git_oid_fromraw(&oid, base_info);
			k = kh_get(oid, p->idx_cache, &oid);
			if (k != kh_end(p->idx_cache)) {
962
				*curpos += 20;
963
				return ((struct git_pack_entry *)kh_value(p->idx_cache, k))->offset;
964 965 966 967 968 969
			} else {
				/* If we're building an index, don't try to find the pack
				 * entry; we just haven't seen it yet.  We'll make
				 * progress again in the next loop.
				 */
				return GIT_PASSTHROUGH;
970 971
			}
		}
972

973
		/* The base entry _must_ be in the same pack */
974 975
		if (pack_entry_find_offset(&base_offset, &unused, p, (git_oid *)base_info, GIT_OID_HEXSZ) < 0)
			return packfile_error("base entry delta is not in the same pack");
976 977 978 979 980 981
		*curpos += 20;
	} else
		return 0;

	return base_offset;
}
982 983 984 985 986 987 988

/***********************************************************
 *
 * PACKFILE METHODS
 *
 ***********************************************************/

989
void git_packfile_free(struct git_pack_file *p)
990
{
991 992 993
	if (!p)
		return;

994
	cache_free(&p->bases);
995

996 997
	if (p->mwf.fd >= 0) {
		git_mwindow_free_all_locked(&p->mwf);
998
		p_close(p->mwf.fd);
999
	}
1000 1001 1002

	pack_index_free(p);

1003
	git__free(p->bad_object_sha1);
1004 1005

	git_mutex_free(&p->lock);
1006
	git_mutex_free(&p->bases.lock);
1007
	git__free(p);
1008 1009 1010 1011 1012 1013 1014 1015 1016
}

static int packfile_open(struct git_pack_file *p)
{
	struct stat st;
	struct git_pack_header hdr;
	git_oid sha1;
	unsigned char *idx_sha1;

1017
	if (p->index_version == -1 && pack_index_open(p) < 0)
1018
		return git_odb__error_notfound("failed to open packfile", NULL, 0);
1019

1020 1021 1022 1023 1024 1025 1026 1027 1028
	/* if mwf opened by another thread, return now */
	if (git_mutex_lock(&p->lock) < 0)
		return packfile_error("failed to get lock for open");

	if (p->mwf.fd >= 0) {
		git_mutex_unlock(&p->lock);
		return 0;
	}

1029
	/* TODO: open with noatime */
1030
	p->mwf.fd = git_futils_open_ro(p->pack_name);
1031 1032
	if (p->mwf.fd < 0)
		goto cleanup;
1033

1034 1035 1036
	if (p_fstat(p->mwf.fd, &st) < 0 ||
		git_mwindow_file_register(&p->mwf) < 0)
		goto cleanup;
1037 1038 1039 1040 1041

	/* If we created the struct before we had the pack we lack size. */
	if (!p->mwf.size) {
		if (!S_ISREG(st.st_mode))
			goto cleanup;
1042
		p->mwf.size = (git_off_t)st.st_size;
1043 1044 1045 1046 1047 1048 1049 1050 1051
	} else if (p->mwf.size != st.st_size)
		goto cleanup;

#if 0
	/* We leave these file descriptors open with sliding mmap;
	 * there is no point keeping them open across exec(), though.
	 */
	fd_flag = fcntl(p->mwf.fd, F_GETFD, 0);
	if (fd_flag < 0)
1052
		goto cleanup;
1053 1054 1055

	fd_flag |= FD_CLOEXEC;
	if (fcntl(p->pack_fd, F_SETFD, fd_flag) == -1)
1056
		goto cleanup;
1057 1058 1059
#endif

	/* Verify we recognize this pack file format. */
1060 1061 1062
	if (p_read(p->mwf.fd, &hdr, sizeof(hdr)) < 0 ||
		hdr.hdr_signature != htonl(PACK_SIGNATURE) ||
		!pack_version_ok(hdr.hdr_version))
1063 1064 1065
		goto cleanup;

	/* Verify the pack matches its index. */
1066 1067 1068
	if (p->num_objects != ntohl(hdr.hdr_entries) ||
		p_lseek(p->mwf.fd, p->mwf.size - GIT_OID_RAWSZ, SEEK_SET) == -1 ||
		p_read(p->mwf.fd, sha1.id, GIT_OID_RAWSZ) < 0)
1069 1070 1071 1072
		goto cleanup;

	idx_sha1 = ((unsigned char *)p->index_map.data) + p->index_map.len - 40;

1073 1074 1075 1076 1077
	if (git_oid__cmp(&sha1, (git_oid *)idx_sha1) != 0)
		goto cleanup;

	git_mutex_unlock(&p->lock);
	return 0;
1078 1079

cleanup:
1080
	giterr_set(GITERR_OS, "Invalid packfile '%s'", p->pack_name);
1081

1082 1083
	if (p->mwf.fd >= 0)
		p_close(p->mwf.fd);
1084
	p->mwf.fd = -1;
1085 1086 1087

	git_mutex_unlock(&p->lock);

1088
	return -1;
1089 1090
}

1091 1092 1093 1094 1095 1096 1097 1098
/*
 * Build the packfile name that corresponds to an index path by
 * replacing the last four characters of `path` (expected to be
 * ".idx") with ".pack".
 *
 * On success returns 0 and stores a newly allocated string in *out,
 * which the caller owns. Fails when `path` is shorter than ".idx" or
 * when the name cannot be formatted.
 */
int git_packfile__name(char **out, const char *path)
{
	const size_t ext_len = strlen(".idx");
	git_buf name = GIT_BUF_INIT;
	size_t path_len = strlen(path);

	if (path_len < ext_len)
		return git_odb__error_notfound("invalid packfile path", NULL, 0);

	if (git_buf_printf(&name, "%.*s.pack", (int)(path_len - ext_len), path) < 0)
		return -1;

	*out = git_buf_detach(&name);
	return 0;
}

1108
int git_packfile_alloc(struct git_pack_file **pack_out, const char *path)
1109 1110 1111
{
	struct stat st;
	struct git_pack_file *p;
1112
	size_t path_len = path ? strlen(path) : 0, alloc_len;
1113 1114

	*pack_out = NULL;
1115

1116
	if (path_len < strlen(".idx"))
1117
		return git_odb__error_notfound("invalid packfile path", NULL, 0);
1118

1119 1120
	GITERR_CHECK_ALLOC_ADD(&alloc_len, sizeof(*p), path_len);
	GITERR_CHECK_ALLOC_ADD(&alloc_len, alloc_len, 2);
1121

1122
	p = git__calloc(1, alloc_len);
1123
	GITERR_CHECK_ALLOC(p);
1124

1125 1126
	memcpy(p->pack_name, path, path_len + 1);

1127 1128 1129 1130
	/*
	 * Make sure a corresponding .pack file exists and that
	 * the index looks sane.
	 */
1131 1132 1133 1134 1135 1136
	if (git__suffixcmp(path, ".idx") == 0) {
		size_t root_len = path_len - strlen(".idx");

		memcpy(p->pack_name + root_len, ".keep", sizeof(".keep"));
		if (git_path_exists(p->pack_name) == true)
			p->pack_keep = 1;
1137

1138 1139
		memcpy(p->pack_name + root_len, ".pack", sizeof(".pack"));
	}
1140

1141
	if (p_stat(p->pack_name, &st) < 0 || !S_ISREG(st.st_mode)) {
1142
		git__free(p);
1143
		return git_odb__error_notfound("packfile not found", NULL, 0);
1144 1145 1146 1147 1148
	}

	/* ok, it looks sane as far as we can check without
	 * actually mapping the pack file.
	 */
1149
	p->mwf.fd = -1;
1150
	p->mwf.size = st.st_size;
1151 1152
	p->pack_local = 1;
	p->mtime = (git_time_t)st.st_mtime;
1153
	p->index_version = -1;
1154

Russell Belfer committed
1155 1156 1157 1158 1159
	if (git_mutex_init(&p->lock)) {
		giterr_set(GITERR_OS, "Failed to initialize packfile mutex");
		git__free(p);
		return -1;
	}
1160

1161 1162 1163 1164 1165
	if (cache_init(&p->bases) < 0) {
		git__free(p);
		return -1;
	}

1166
	*pack_out = p;
1167 1168

	return 0;
1169 1170 1171 1172 1173 1174 1175 1176
}

/***********************************************************
 *
 * PACKFILE ENTRY SEARCH INTERNALS
 *
 ***********************************************************/

1177
static git_off_t nth_packed_object_offset(const struct git_pack_file *p, uint32_t n)
1178 1179
{
	const unsigned char *index = p->index_map.data;
1180
	const unsigned char *end = index + p->index_map.len;
1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
	index += 4 * 256;
	if (p->index_version == 1) {
		return ntohl(*((uint32_t *)(index + 24 * n)));
	} else {
		uint32_t off;
		index += 8 + p->num_objects * (20 + 4);
		off = ntohl(*((uint32_t *)(index + 4 * n)));
		if (!(off & 0x80000000))
			return off;
		index += p->num_objects * 4 + (off & 0x7fffffff) * 8;
1191 1192 1193 1194 1195

		/* Make sure we're not being sent out of bounds */
		if (index >= end - 8)
			return -1;

1196
		return (((uint64_t)ntohl(*((uint32_t *)(index + 0)))) << 32) |
Vicent Marti committed
1197
					ntohl(*((uint32_t *)(index + 4)));
1198 1199 1200
	}
}

1201 1202 1203 1204
/* Compare two buffers by their first four bytes only, memcmp()-style. */
static int git__memcmp4(const void *a, const void *b)
{
	enum { CMP_WIDTH = 4 };

	return memcmp(a, b, CMP_WIDTH);
}

1205
int git_pack_foreach_entry(
1206
	struct git_pack_file *p,
1207
	git_odb_foreach_cb cb,
1208
	void *data)
1209 1210 1211
{
	const unsigned char *index = p->index_map.data, *current;
	uint32_t i;
1212
	int error = 0;
1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228

	if (index == NULL) {
		if ((error = pack_index_open(p)) < 0)
			return error;

		assert(p->index_map.data);

		index = p->index_map.data;
	}

	if (p->index_version > 1) {
		index += 8;
	}

	index += 4 * 256;

1229 1230
	if (p->oids == NULL) {
		git_vector offsets, oids;
1231

1232 1233 1234 1235 1236
		if ((error = git_vector_init(&oids, p->num_objects, NULL)))
			return error;

		if ((error = git_vector_init(&offsets, p->num_objects, git__memcmp4)))
			return error;
1237

1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251
		if (p->index_version > 1) {
			const unsigned char *off = index + 24 * p->num_objects;
			for (i = 0; i < p->num_objects; i++)
				git_vector_insert(&offsets, (void*)&off[4 * i]);
			git_vector_sort(&offsets);
			git_vector_foreach(&offsets, i, current)
				git_vector_insert(&oids, (void*)&index[5 * (current - off)]);
		} else {
			for (i = 0; i < p->num_objects; i++)
				git_vector_insert(&offsets, (void*)&index[24 * i]);
			git_vector_sort(&offsets);
			git_vector_foreach(&offsets, i, current)
				git_vector_insert(&oids, (void*)&current[4]);
		}
1252

1253
		git_vector_free(&offsets);
1254
		p->oids = (git_oid **)git_vector_detach(NULL, NULL, &oids);
1255 1256
	}

1257
	for (i = 0; i < p->num_objects; i++)
1258 1259
		if ((error = cb(p->oids[i], data)) != 0)
			return giterr_set_after_callback(error);
1260

1261
	return error;
1262 1263
}

1264
static int pack_entry_find_offset(
1265 1266 1267 1268
	git_off_t *offset_out,
	git_oid *found_oid,
	struct git_pack_file *p,
	const git_oid *short_oid,
1269
	size_t len)
1270
{
1271 1272
	const uint32_t *level1_ofs;
	const unsigned char *index;
1273 1274
	unsigned hi, lo, stride;
	int pos, found = 0;
1275
	git_off_t offset;
1276 1277 1278 1279
	const unsigned char *current = 0;

	*offset_out = 0;

1280
	if (p->index_version == -1) {
1281
		int error;
1282

1283 1284 1285 1286
		if ((error = pack_index_open(p)) < 0)
			return error;
		assert(p->index_map.data);
	}
1287

1288 1289 1290
	index = p->index_map.data;
	level1_ofs = p->index_map.data;

1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311
	if (p->index_version > 1) {
		level1_ofs += 2;
		index += 8;
	}

	index += 4 * 256;
	hi = ntohl(level1_ofs[(int)short_oid->id[0]]);
	lo = ((short_oid->id[0] == 0x0) ? 0 : ntohl(level1_ofs[(int)short_oid->id[0] - 1]));

	if (p->index_version > 1) {
		stride = 20;
	} else {
		stride = 24;
		index += 4;
	}

#ifdef INDEX_DEBUG_LOOKUP
	printf("%02x%02x%02x... lo %u hi %u nr %d\n",
		short_oid->id[0], short_oid->id[1], short_oid->id[2], lo, hi, p->num_objects);
#endif

1312
#ifdef GIT_USE_LOOKUP
Vicent Marti committed
1313
	pos = sha1_entry_pos(index, stride, 0, lo, hi, p->num_objects, short_oid->id);
1314 1315 1316
#else
	pos = sha1_position(index, stride, lo, hi, short_oid->id);
#endif
1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328

	if (pos >= 0) {
		/* An object matching exactly the oid was found */
		found = 1;
		current = index + pos * stride;
	} else {
		/* No object was found */
		/* pos refers to the object with the "closest" oid to short_oid */
		pos = - 1 - pos;
		if (pos < (int)p->num_objects) {
			current = index + pos * stride;

Russell Belfer committed
1329
			if (!git_oid_ncmp(short_oid, (const git_oid *)current, len))
1330 1331 1332 1333
				found = 1;
		}
	}

1334
	if (found && len != GIT_OID_HEXSZ && pos + 1 < (int)p->num_objects) {
1335 1336 1337 1338 1339 1340 1341 1342
		/* Check for ambiguousity */
		const unsigned char *next = current + stride;

		if (!git_oid_ncmp(short_oid, (const git_oid *)next, len)) {
			found = 2;
		}
	}

1343
	if (!found)
1344
		return git_odb__error_notfound("failed to find offset for pack entry", short_oid, len);
1345 1346
	if (found > 1)
		return git_odb__error_ambiguous("found multiple offsets for pack entry");
1347

1348 1349 1350 1351 1352 1353
	if ((offset = nth_packed_object_offset(p, pos)) < 0) {
		giterr_set(GITERR_ODB, "packfile index is corrupt");
		return -1;
	}

	*offset_out = offset;
1354
	git_oid_fromraw(found_oid, current);
1355 1356

#ifdef INDEX_DEBUG_LOOKUP
1357
	{
1358 1359 1360 1361 1362
		unsigned char hex_sha1[GIT_OID_HEXSZ + 1];
		git_oid_fmt(hex_sha1, found_oid);
		hex_sha1[GIT_OID_HEXSZ] = '\0';
		printf("found lo=%d %s\n", lo, hex_sha1);
	}
1363
#endif
1364

1365
	return 0;
1366 1367 1368 1369 1370 1371
}

int git_pack_entry_find(
		struct git_pack_entry *e,
		struct git_pack_file *p,
		const git_oid *short_oid,
1372
		size_t len)
1373
{
1374
	git_off_t offset;
1375 1376 1377 1378 1379 1380 1381 1382
	git_oid found_oid;
	int error;

	assert(p);

	if (len == GIT_OID_HEXSZ && p->num_bad_objects) {
		unsigned i;
		for (i = 0; i < p->num_bad_objects; i++)
1383
			if (git_oid__cmp(short_oid, &p->bad_object_sha1[i]) == 0)
1384
				return packfile_error("bad object found in packfile");
1385 1386 1387
	}

	error = pack_entry_find_offset(&offset, &found_oid, p, short_oid, len);
1388 1389
	if (error < 0)
		return error;
1390 1391 1392 1393

	/* we found a unique entry in the index;
	 * make sure the packfile backing the index
	 * still exists on disk */
1394 1395
	if (p->mwf.fd == -1 && (error = packfile_open(p)) < 0)
		return error;
1396 1397 1398 1399 1400

	e->offset = offset;
	e->p = p;

	git_oid_cpy(&e->sha1, &found_oid);
1401
	return 0;
1402
}