indexer.c 31.1 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3
 *
Vicent Marti committed
4 5
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
6 7
 */

8 9
#include "indexer.h"

Carlos Martín Nieto committed
10
#include "git2/indexer.h"
11
#include "git2/object.h"
Carlos Martín Nieto committed
12

13 14 15
#include "commit.h"
#include "tree.h"
#include "tag.h"
16
#include "pack.h"
Carlos Martín Nieto committed
17
#include "mwindow.h"
18
#include "posix.h"
19 20
#include "pack.h"
#include "filebuf.h"
21
#include "oid.h"
22
#include "oidarray.h"
23
#include "oidmap.h"
24
#include "zstream.h"
25
#include "object.h"
26

27 28
/* Mutex defined in another translation unit; declared here for this file's use. */
extern git_mutex git__mwindow_mutex;

/*
 * Largest object count a pack header may announce; packs declaring more
 * are rejected with "too many objects" when the header is parsed.
 */
size_t git_indexer__max_objects = UINT32_MAX;

/*
 * Largest pack offset stored directly in `struct entry::offset`.  Offsets
 * above this are recorded in `offset_long` instead.
 */
#define UINT31_MAX (0x7FFFFFFF)
32

33
struct entry {
34
	git_oid oid;
35 36 37 38 39
	uint32_t crc;
	uint32_t offset;
	uint64_t offset_long;
};

40
struct git_indexer {
41
	unsigned int parsed_header :1,
42
		pack_committed :1,
43
		have_stream :1,
44
		have_delta :1,
45 46
		do_fsync :1,
		do_verify :1;
47
	struct git_pack_header hdr;
48
	struct git_pack_file *pack;
49
	unsigned int mode;
50
	git_off_t off;
51
	git_off_t entry_start;
52
	git_object_t entry_type;
53
	git_buf entry_data;
54
	git_packfile_stream stream;
55 56 57 58
	size_t nr_objects;
	git_vector objects;
	git_vector deltas;
	unsigned int fanout[256];
59
	git_hash_ctx hash_ctx;
60
	git_oid hash;
61
	git_indexer_progress_cb progress_cb;
62
	void *progress_payload;
63
	char objbuf[8*1024];
64

65 66 67
	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

68 69 70
	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

71 72
	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];
73
	size_t inbuf_len;
74
	git_hash_ctx trailer;
75 76 77
};

struct delta_info {
78
	git_off_t delta_off;
79 80
};

81
const git_oid *git_indexer_hash(const git_indexer *idx)
82 83 84 85
{
	return &idx->hash;
}

86
static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
87 88
{
	int error;
89
	git_map map;
90

91
	if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
92
		return error;
93

94 95 96 97
	memcpy(hdr, map.data, sizeof(*hdr));
	p_munmap(&map);

	/* Verify we recognize this pack file format. */
98
	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
99
		git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
100 101
		return -1;
	}
102

103
	if (!pack_version_ok(hdr->hdr_version)) {
104
		git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
105 106
		return -1;
	}
Carlos Martín Nieto committed
107

108
	return 0;
109 110
}

111
static int objects_cmp(const void *a, const void *b)
112 113 114 115
{
	const struct entry *entrya = a;
	const struct entry *entryb = b;

116
	return git_oid__cmp(&entrya->oid, &entryb->oid);
117 118
}

119 120 121 122 123 124 125
/*
 * Fill `opts` from the GIT_INDEXER_OPTIONS_INIT template for the requested
 * structure `version`.  The work is done by the
 * GIT_INIT_STRUCTURE_FROM_TEMPLATE macro; returns 0 once it completes.
 */
int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(opts, version, git_indexer_options, GIT_INDEXER_OPTIONS_INIT);

	return 0;
}

126 127
int git_indexer_new(
		git_indexer **out,
128
		const char *prefix,
129
		unsigned int mode,
130
		git_odb *odb,
131
		git_indexer_options *in_opts)
132
{
133
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
134
	git_indexer *idx;
135
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
136
	static const char suff[] = "/pack";
137
	int error, fd = -1;
138

139 140 141
	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

142
	idx = git__calloc(1, sizeof(git_indexer));
143
	GIT_ERROR_CHECK_ALLOC(idx);
144
	idx->odb = odb;
145 146
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
147
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
148
	git_hash_ctx_init(&idx->hash_ctx);
149
	git_hash_ctx_init(&idx->trailer);
150
	git_buf_init(&idx->entry_data, 0);
151 152 153

	if ((error = git_oidmap_new(&idx->expected_oids)) < 0)
		goto cleanup;
154

155
	idx->do_verify = opts.verify;
156

157
	if (git_repository__fsync_gitdir)
158 159
		idx->do_fsync = 1;

160 161 162 163
	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

164
	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
165
	git_buf_dispose(&path);
166 167 168 169
	if (fd < 0)
		goto cleanup;

	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
170
	git_buf_dispose(&tmp_path);
171

172 173 174
	if (error < 0)
		goto cleanup;

175 176 177 178
	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

179 180 181 182
	*out = idx;
	return 0;

cleanup:
183 184 185
	if (fd != -1)
		p_close(fd);

lhchavez committed
186 187
	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));
188 189

	if (idx->pack != NULL)
lhchavez committed
190
		p_unlink(idx->pack->pack_name);
191

192 193
	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
194 195 196 197
	git__free(idx);
	return -1;
}

198 199 200 201 202
/* Turn the indexer's fsync flag on (non-zero `do_fsync`) or off (zero). */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = do_fsync ? 1 : 0;
}

203
/* Try to store the delta so we can try to resolve it later */
204
static int store_delta(git_indexer *idx)
205
{
206 207
	struct delta_info *delta;

208
	delta = git__calloc(1, sizeof(struct delta_info));
209
	GIT_ERROR_CHECK_ALLOC(delta);
210
	delta->delta_off = idx->entry_start;
211

212
	if (git_vector_insert(&idx->deltas, delta) < 0)
213 214 215 216 217
		return -1;

	return 0;
}

218
/*
 * Format the object header for an object of `type` and `len` bytes and
 * feed it to `ctx`.
 *
 * Returns the formatting error if one occurs, otherwise the result of
 * git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, git_off_t len, git_object_t type)
{
	char hdr[64];
	size_t hdr_size;
	int error;

	error = git_odb__format_object_header(&hdr_size, hdr, sizeof(hdr), (size_t)len, type);
	if (error < 0)
		return error;

	return git_hash_update(ctx, hdr, hdr_size);
}

231
/*
 * Drain `stream`, feeding every chunk into the indexer's object hash and,
 * when verification is enabled, appending it to `entry_data`.
 *
 * Returns 0 once the stream is exhausted, the (negative) value reported
 * by git_packfile_stream_read() if reading fails, or the error from
 * buffering/hashing a chunk.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t nread;
	int error;

	assert(idx && stream);

	for (;;) {
		nread = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
		if (nread < 0)
			return (int)nread;

		/* A failed append would leave incomplete data for verification. */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, nread)) < 0)
			return error;

		/* A failed update would produce a wrong object id. */
		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, nread)) < 0)
			return error;

		if (nread == 0)
			break;
	}

	return 0;
}

253
/* In order to create the packfile stream, we need to skip over the delta base description */
254
static int advance_delta_offset(git_indexer *idx, git_object_t type)
255 256 257
{
	git_mwindow *w = NULL;

258
	assert(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
259

260
	if (type == GIT_OBJECT_REF_DELTA) {
261 262 263 264 265 266 267 268 269 270 271 272
		idx->off += GIT_OID_RAWSZ;
	} else {
		git_off_t base_off = get_delta_base(idx->pack, &w, &idx->off, type, idx->entry_start);
		git_mwindow_close(&w);
		if (base_off < 0)
			return (int)base_off;
	}

	return 0;
}

/* Read from the stream and discard any output */
273
static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
274 275 276 277 278 279
{
	ssize_t read;

	assert(stream);

	do {
280
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
281 282 283 284 285 286 287 288
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

289 290 291 292 293 294 295 296 297
/*
 * Compute the CRC-32 of `size` bytes of the pack starting at `start` and
 * store it, in network byte order, in `crc_out`.
 *
 * Returns 0 on success, -1 if a window onto the pack cannot be opened or
 * yields no data.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start, git_off_t size)
{
	void *ptr;
	uint32_t crc;
	unsigned int left, len;
	git_mwindow *w = NULL;

	crc = crc32(0L, Z_NULL, 0);
	while (size) {
		ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left);
		if (ptr == NULL)
			return -1;

		/*
		 * Compare in the wide type: truncating `size` to unsigned int
		 * first would give a zero length for multiples of 4 GiB and
		 * spin forever.
		 */
		len = ((git_off_t)left < size) ? left : (unsigned int)size;
		if (len == 0) {
			/* No bytes available in the window: cannot make progress. */
			git_mwindow_close(&w);
			git_error_set(GIT_ERROR_INDEXER, "failed to read pack entry for CRC");
			return -1;
		}

		crc = crc32(crc, ptr, len);
		size -= len;
		start += len;
		git_mwindow_close(&w);
	}

	*crc_out = htonl(crc);
	return 0;
}

313
/*
 * Record `oid` as an object we still expect to see, unless it is already
 * known.
 *
 * Returns 0 on success (including when nothing had to be recorded) and a
 * negative value if allocation or the map insertion fails.
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	git_oid *dup;
	int error;

	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if (idx->odb && git_odb_exists(idx->odb, oid))
		return 0;

	if (git_oidmap_exists(idx->pack->idx_cache, oid) ||
	    git_oidmap_exists(idx->expected_oids, oid))
		return 0;

	dup = git__malloc(sizeof(*oid));
	GIT_ERROR_CHECK_ALLOC(dup);
	git_oid_cpy(dup, oid);

	/* The copy serves as both key and value; free it if the map rejects it. */
	if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
		git__free(dup);

	return error;
}

/*
 * Parse a raw object and update the set of expected object ids: the
 * object itself is no longer expected, while everything it references
 * (tree entries, commit parents and tree, tag target) becomes expected
 * unless already known.
 *
 * Objects of any type other than blob, tree, commit or tag are ignored.
 *
 * Returns 0 on success or a negative value if the object cannot be
 * parsed or an expected id cannot be recorded.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	git_object *object;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	/*
	 * `object` is only valid once parsing succeeded, so there is nothing
	 * to free on this path.  (Assumes git_object__from_raw() does not hand
	 * back an object when it fails.)
	 */
	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		return error;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid)) {
		error = 0;
		goto out;
	}

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry)
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid)
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}

406
/*
 * Finish the non-delta object that was just read from the stream:
 * finalise its id, optionally check its connectivity, register it in the
 * pack's id cache, compute its CRC and append it to the object list.
 *
 * Returns 0 on success and -1 on failure (including a duplicate object).
 */
static int store_object(git_indexer *idx)
{
	int i, error;
	git_oid oid;
	struct entry *entry;
	git_off_t entry_size;
	struct git_pack_entry *pentry = NULL;
	git_off_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_hash_final(&oid, &idx->hash_ctx);
	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
		    idx->entry_data.ptr,
		    idx->entry_data.size,
		    idx->entry_type
		};

		if ((error = check_object_connectivity(idx, &rawobj)) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if ((error = git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry)) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* The cache references pentry from here on; it must not be freed below. */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* Every fanout bucket from the id's first byte upwards gains one object. */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}

477 478
/* Tell whether `id` is already present in the pack's id cache. */
GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
{
	git_oidmap *cache = idx->pack->idx_cache;

	return git_oidmap_exists(cache, id);
}

482
/*
 * Record an object that starts at `entry_start`: fill in the offsets of
 * `entry` and `pentry`, register `pentry` in the pack's id cache, append
 * `entry` to the object list and bump the fanout counters.
 *
 * Returns 0 on success, -1 if the id is already cached or an insertion
 * fails.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, git_off_t entry_start)
{
	int bucket;

	/* Offsets too large for 31 bits go into the 64-bit field. */
	if (entry_start <= UINT31_MAX) {
		entry->offset = (uint32_t)entry_start;
	} else {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1) ||
	    git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		return -1;

	/* Every bucket from the id's first byte upwards gains one object. */
	bucket = entry->oid.id[0];
	while (bucket < 256) {
		idx->fanout[bucket]++;
		++bucket;
	}

	return 0;
}

512
/*
 * Hash a fully reconstructed object, compute the CRC of its packed
 * representation (from `entry_start` up to idx->off) and record it via
 * save_entry().
 *
 * Returns 0 on success.  On failure returns -1 and frees obj->data, so
 * the caller must not use or free it again.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, git_off_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) < 0) {
		/*
		 * save_entry() may already have handed pentry to the id cache,
		 * so only the object data is known to be safe to release here.
		 */
		git__free(obj->data);
		obj->data = NULL;
		return -1;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	obj->data = NULL;
	return -1;
}
546

547
/*
 * Invoke the user's progress callback, if one was configured, and pass
 * its return value through git_error_set_after_callback_function().
 *
 * Returns 0 when no callback is set.
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}

556
/* Hash everything but the last 20B of input */
557
static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
558
{
559
	size_t to_expell, to_keep;
560 561 562 563 564

	if (size == 0)
		return;

	/* Easy case, dump the buffer and the data minus the last 20 bytes */
565
	if (size >= GIT_OID_RAWSZ) {
566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582
		git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
		git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);

		data += size - GIT_OID_RAWSZ;
		memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
		idx->inbuf_len = GIT_OID_RAWSZ;
		return;
	}

	/* We can just append */
	if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
		memcpy(idx->inbuf + idx->inbuf_len, data, size);
		idx->inbuf_len += size;
		return;
	}

	/* We need to partially drain the buffer and then append */
583 584
	to_keep   = GIT_OID_RAWSZ - size;
	to_expell = idx->inbuf_len - to_keep;
585 586 587 588 589 590 591 592

	git_hash_update(&idx->trailer, idx->inbuf, to_expell);

	memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
	memcpy(idx->inbuf + to_keep, data, size);
	idx->inbuf_len += size - to_expell;
}

593 594 595
/*
 * Copy `size` bytes from `data` into the pack file at `offset` through a
 * temporary shared, writable mapping.
 *
 * Returns 0 on success or the error from git__mmap_alignment()/p_mmap().
 */
static int write_at(git_indexer *idx, const void *data, git_off_t offset, size_t size)
{
	git_file fd = idx->pack->mwf.fd;
	size_t alignment, lead;
	git_off_t map_start;
	git_map map;
	int error;

	assert(data && size);

	if ((error = git__mmap_alignment(&alignment)) < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	lead = offset % alignment;
	map_start = offset - lead;

	error = p_mmap(&map, lead + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, map_start);
	if (error < 0)
		return error;

	/* Skip the alignment padding at the front of the mapping. */
	memcpy((unsigned char *)map.data + lead, data, size);
	p_munmap(&map);

	return 0;
}

/*
 * Write `size` bytes of `data` at the current end of the pack
 * (idx->pack->mwf.size).  The recorded size is not changed here; callers
 * update it afterwards.
 *
 * Returns 0 on success (or when `size` is 0), a negative value otherwise.
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	git_off_t end_after_write;
	size_t alignment;
	size_t tail_in_page;
	git_off_t last_page_start;
	git_off_t old_size = idx->pack->mwf.size;
	int fd = idx->pack->mwf.fd;
	int error;

	if (!size)
		return 0;

	if ((error = git__mmap_alignment(&alignment)) < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	end_after_write = old_size + size;
	tail_in_page = end_after_write % alignment;
	last_page_start = end_after_write - tail_in_page;

	if (p_lseek(fd, last_page_start + alignment - 1, SEEK_SET) < 0 ||
	    p_write(idx->pack->mwf.fd, data, 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, idx->pack->mwf.size, size);
}

655
/*
 * Parse the next entry of the pack at idx->off, resuming a partially read
 * entry when idx->have_stream is set.
 *
 * Non-delta objects are hashed and stored; deltas are only read through
 * and remembered for later resolution.  `stats` is updated and the
 * progress callback invoked for every completed entry.
 *
 * Returns 0 on success, GIT_EBUFS when the data appended so far does not
 * yet cover the entry (and the trailing 20 bytes), or another error code.
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	git_off_t start_off = idx->off;
	git_mwindow *window = NULL;
	git_object_t type;
	size_t entry_size;
	int error;

	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	if (!idx->have_stream) {
		error = git_packfile_unpack_header(&entry_size, &type, &idx->pack->mwf, &window, &idx->off);

		/* Rewind so the header is parsed again once more data arrives. */
		if (error == GIT_EBUFS)
			idx->off = start_off;
		if (error == GIT_EBUFS || error < 0)
			return error;

		git_mwindow_close(&window);
		idx->entry_start = start_off;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
			error = advance_delta_offset(idx, type);

			if (error == GIT_EBUFS)
				idx->off = start_off;
			if (error == GIT_EBUFS || error < 0)
				return error;

			idx->have_delta = 1;
		} else {
			idx->have_delta = 0;

			if ((error = hash_header(&idx->hash_ctx, entry_size, type)) < 0)
				return error;
		}

		idx->have_stream = 1;
		idx->entry_type = type;

		if ((error = git_packfile_stream_open(stream, idx->pack, idx->off)) < 0)
			return error;
	}

	error = idx->have_delta
		? read_object_stream(idx, stream)
		: hash_object_stream(idx, stream);

	idx->off = stream->curpos;

	/* Keep the stream open; the next call continues where we stopped. */
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	error = idx->have_delta ? store_delta(idx) : store_object(idx);
	if (error < 0)
		return error;

	if (!idx->have_delta)
		stats->indexed_objects++;
	stats->received_objects++;

	return do_progress_callback(idx, stats);
}

744
/*
 * Append `size` bytes of pack data to the indexer and parse as many
 * complete entries as the data received so far allows.
 *
 * The first call(s) that complete the pack header also validate it and
 * initialise the object/delta lists and `stats`.
 *
 * Returns 0 on success (including when more data is needed before any
 * further progress can be made) or a negative error code.
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	assert(idx && data && stats);

	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* Pass the size through untouched; narrowing it to int corrupts large sizes. */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* Wait until the whole header has arrived. */
		if ((unsigned)idx->pack->mwf.size < sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	git_mwindow_free_all(mwf);

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* Out of buffered data: stop for now, this is not a failure. */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
821

822
/*
 * Replace the file-name part of `path` (everything after the last '/')
 * with "pack-<hex of idx->hash><suffix>".
 *
 * Returns 0 on success, -1 if the buffer cannot be grown.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len;

	/* Length of the directory part, including its trailing '/'. */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	if (git_buf_grow(path, dir_len + 1 + strlen(prefix) +
					 GIT_OID_HEXSZ + strlen(suffix) + 1) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* The hex id is written straight into the buffer, so bump the size by hand. */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	return git_buf_oom(path) ? -1 : 0;
}

844 845 846 847
/**
 * Rewind the packfile by the trailer, as we might need to fix the
 * packfile by injecting objects at the tail and must overwrite it.
 */
848
static void seek_back_trailer(git_indexer *idx)
849 850 851 852 853
{
	idx->pack->mwf.size -= GIT_OID_RAWSZ;
	git_mwindow_free_all(&idx->pack->mwf);
}

854
/*
 * Append the object `id`, read from the indexer's ODB, to the end of the
 * pack (overwriting the previous trailer) followed by a placeholder
 * trailer, and record it in the index.
 *
 * Returns 0 on success and a negative value on failure.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	git_off_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	seek_back_trailer(idx);
	entry_start = idx->pack->mwf.size;

	if (git_odb_read(&obj, idx->odb, id) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		return -1;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		/* Go through cleanup so the ODB object is released. */
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	hdr_len = git_packfile__object_header(hdr, len, git_odb_object_type(obj));
	if ((error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));

	/* Write a fake trailer so the pack functions play ball */
	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

cleanup:
	/* Released on every path, including failed writes. */
	git_buf_dispose(&buf);

	/*
	 * NOTE(review): if save_entry() fails after adding pentry to the id
	 * cache, the cache keeps a pointer to the memory freed here.
	 */
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	git_odb_object_free(obj);
	return error;
}

928
/*
 * Try to make progress on a thin pack: find the first unresolved
 * REF delta, read the id of its base and, unless that base is already in
 * the pack, inject it from the ODB.
 *
 * Returns 0 on success (the base was present or has been injected) and a
 * negative value on failure.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	int error, have_ref_delta = 0;
	unsigned int pos;
	struct delta_info *pending;
	size_t size;
	git_object_t type;
	git_mwindow *window = NULL;
	git_off_t curpos = 0;
	unsigned char *base_raw;
	unsigned int left = 0;
	git_oid base;

	assert(git_vector_length(&idx->deltas) > 0);

	if (idx->odb == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, pos, pending) {
		/* Slots of already resolved deltas are NULL. */
		if (pending == NULL)
			continue;

		curpos = pending->delta_off;
		error = git_packfile_unpack_header(&size, &type, &idx->pack->mwf, &window, &curpos);
		if (error < 0)
			return error;

		if (type == GIT_OBJECT_REF_DELTA) {
			have_ref_delta = 1;
			break;
		}
	}

	if (!have_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* curpos now points to the base information, which is an OID */
	base_raw = git_mwindow_open(&idx->pack->mwf, &window, curpos, GIT_OID_RAWSZ, &left);
	if (base_raw == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, base_raw);
	git_mwindow_close(&window);

	/* Nothing to inject when the base is already part of the pack. */
	if (has_entry(idx, &base))
		return 0;

	if (inject_object(idx, &base) < 0)
		return -1;

	stats->local_objects++;

	return 0;
}

990
/*
 * Resolve the deltas collected while reading the pack.
 *
 * The pending list is walked repeatedly; a delta whose base is not yet
 * available is retried on the next pass.  When a pass makes no progress,
 * fix_thin_pack() is asked to inject a missing base from the ODB.
 *
 * Returns 0 on success, a negative value on failure or when the progress
 * callback reports one.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			/* Slots of already resolved deltas are NULL. */
			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* The unpacked data is ours; release it before moving on. */
				git__free(obj.data);
				continue;
			}

			/* On failure hash_and_save() has already released obj.data. */
			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}

1047
/*
 * Rewrite the pack header so that its object count includes the locally
 * injected objects, then recompute the trailer hash over the whole pack.
 *
 * Returns 0 on success, -1 if the header cannot be written or the pack
 * cannot be mapped.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	const size_t chunk = 1024*1024;
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *window = NULL;
	git_off_t pos;
	unsigned int avail;
	void *bytes;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * We now use the same technique as before to determine the
	 * hash. We keep reading up to the end and let
	 * hash_partially() keep the existing trailer out of the
	 * calculation.
	 */
	git_mwindow_free_all(mwf);
	idx->inbuf_len = 0;

	for (pos = 0; pos < mwf->size; pos += avail) {
		bytes = git_mwindow_open(mwf, &window, pos, chunk, &avail);
		if (bytes == NULL)
			return -1;

		hash_partially(idx, bytes, avail);
		git_mwindow_close(&window);
	}

	return 0;
}

1088
/*
 * Finalize the pack being indexed: check the trailer that was sent to
 * us against the hash we calculated, resolve the remaining deltas,
 * write the version 2 index file next to the pack and rename the pack
 * to its final name.
 *
 * Returns 0 on success. On failure a negative value is returned: the
 * error from resolve_deltas() is passed through unchanged, every other
 * failure yields -1.
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	git_hash_final(&trailer_hash, &idx->trailer);
	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		git_hash_final(&trailer_hash, &idx->trailer);

		/*
		 * The pack on disk must carry the recomputed trailer; if it
		 * cannot be written, the index below would name a hash the
		 * pack does not contain.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name by swapping the "pack" suffix for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		return -1;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/*
	 * NOTE(review): most git_filebuf_write() results below are not
	 * checked individually; presumably the filebuf remembers a failed
	 * write and reports it from a later checked call -- confirm.
	 */

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/*
		 * An offset of UINT32_MAX marks an entry whose real offset
		 * lives in offset_long; store the high bit plus its position
		 * in the long offset table instead.
		 */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		/* 64-bit offset as two network-order words, high word first */
		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid));

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	git_mwindow_free_all(&idx->pack->mwf);

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* go through the common cleanup so 'filename' is released */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}

1289
/*
 * Release an indexer and everything it holds. Passing NULL is a no-op.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *oid_key;
	git_oid *oid_value;
	size_t pos = 0;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* The cached pack entries are freed one by one before the map itself */
	if (idx->pack->idx_cache) {
		struct git_pack_entry *cached;
		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/* The pack is only closed and freed while the mwindow mutex is held */
	if (git_mutex_lock(&git__mwindow_mutex) == 0) {
		/* A pack that was never committed is closed here first */
		if (!idx->pack_committed)
			git_packfile_close(idx->pack, true);

		git_packfile_free(idx->pack);
		git_mutex_unlock(&git__mwindow_mutex);
	}

	/* Free the values stored in the expected-OID map; the map goes last */
	for (; git_oidmap_iterate((void **) &oid_value, idx->expected_oids, &pos, &oid_key) == 0; )
		git__free(oid_value);

	git_hash_ctx_cleanup(&idx->trailer);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_buf_dispose(&idx->entry_data);
	git_oidmap_free(idx->expected_oids);
	git__free(idx);
}