indexer.c 31.8 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3
 *
Vicent Marti committed
4 5
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
6 7
 */

8 9
#include "indexer.h"

Carlos Martín Nieto committed
10
#include "git2/indexer.h"
11
#include "git2/object.h"
Carlos Martín Nieto committed
12

13 14 15
#include "commit.h"
#include "tree.h"
#include "tag.h"
16
#include "pack.h"
Carlos Martín Nieto committed
17
#include "mwindow.h"
18
#include "posix.h"
19 20
#include "pack.h"
#include "filebuf.h"
21
#include "oid.h"
22
#include "oidarray.h"
23
#include "oidmap.h"
24
#include "zstream.h"
25
#include "object.h"
26

27
/* Largest object count a pack header may declare; git_indexer_append() rejects packs that exceed it. */
size_t git_indexer__max_objects = UINT32_MAX;
28

29
/* Entry offsets above this value are recorded in entry.offset_long instead of entry.offset. */
#define UINT31_MAX (0x7FFFFFFF)
30

31
struct entry {
32
	git_oid oid;
33 34 35 36 37
	uint32_t crc;
	uint32_t offset;
	uint64_t offset_long;
};

38
struct git_indexer {
39
	unsigned int parsed_header :1,
40
		pack_committed :1,
41
		have_stream :1,
42
		have_delta :1,
43 44
		do_fsync :1,
		do_verify :1;
45
	struct git_pack_header hdr;
46
	struct git_pack_file *pack;
47
	unsigned int mode;
48 49
	off64_t off;
	off64_t entry_start;
50
	git_object_t entry_type;
51
	git_buf entry_data;
52
	git_packfile_stream stream;
53 54 55 56
	size_t nr_objects;
	git_vector objects;
	git_vector deltas;
	unsigned int fanout[256];
57
	git_hash_ctx hash_ctx;
58
	git_oid hash;
59
	git_indexer_progress_cb progress_cb;
60
	void *progress_payload;
61
	char objbuf[8*1024];
62

63 64 65
	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

66 67 68
	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

69 70
	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];
71
	size_t inbuf_len;
72
	git_hash_ctx trailer;
73 74 75
};

struct delta_info {
76
	off64_t delta_off;
77 78
};

79
const git_oid *git_indexer_hash(const git_indexer *idx)
80 81 82 83
{
	return &idx->hash;
}

84
static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
85 86
{
	int error;
87
	git_map map;
88

89
	if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
90
		return error;
91

92 93 94 95
	memcpy(hdr, map.data, sizeof(*hdr));
	p_munmap(&map);

	/* Verify we recognize this pack file format. */
96
	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
97
		git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
98 99
		return -1;
	}
100

101
	if (!pack_version_ok(hdr->hdr_version)) {
102
		git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
103 104
		return -1;
	}
Carlos Martín Nieto committed
105

106
	return 0;
107 108
}

109
static int objects_cmp(const void *a, const void *b)
110 111 112 113
{
	const struct entry *entrya = a;
	const struct entry *entryb = b;

114
	return git_oid__cmp(&entrya->oid, &entryb->oid);
115 116
}

117
/*
 * Initialize `opts` from the GIT_INDEXER_OPTIONS_INIT template for the
 * requested structure `version`. Returns 0 when the macro (defined
 * elsewhere) falls through.
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(opts, version,
		git_indexer_options, GIT_INDEXER_OPTIONS_INIT);

	return 0;
}

124
#ifndef GIT_DEPRECATE_HARD
125 126 127 128
int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
{
	return git_indexer_options_init(opts, version);
}
129
#endif
130

131 132
int git_indexer_new(
		git_indexer **out,
133
		const char *prefix,
134
		unsigned int mode,
135
		git_odb *odb,
136
		git_indexer_options *in_opts)
137
{
138
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
139
	git_indexer *idx;
140
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
141
	static const char suff[] = "/pack";
142
	int error, fd = -1;
143

144 145 146
	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

147
	idx = git__calloc(1, sizeof(git_indexer));
148
	GIT_ERROR_CHECK_ALLOC(idx);
149
	idx->odb = odb;
150 151
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
152
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
153
	git_buf_init(&idx->entry_data, 0);
154

155 156 157
	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0 ||
	    (error = git_hash_ctx_init(&idx->trailer)) < 0 ||
	    (error = git_oidmap_new(&idx->expected_oids)) < 0)
158
		goto cleanup;
159

160
	idx->do_verify = opts.verify;
161

162
	if (git_repository__fsync_gitdir)
163 164
		idx->do_fsync = 1;

165 166 167 168
	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

169
	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
170
	git_buf_dispose(&path);
171 172 173 174
	if (fd < 0)
		goto cleanup;

	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
175
	git_buf_dispose(&tmp_path);
176

177 178 179
	if (error < 0)
		goto cleanup;

180 181 182 183
	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

184 185 186 187
	*out = idx;
	return 0;

cleanup:
188 189 190
	if (fd != -1)
		p_close(fd);

lhchavez committed
191 192
	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));
193 194

	if (idx->pack != NULL)
lhchavez committed
195
		p_unlink(idx->pack->pack_name);
196

197 198
	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
199 200 201 202
	git__free(idx);
	return -1;
}

203 204 205 206 207
/* Set the indexer's do_fsync flag; any non-zero `do_fsync` enables it. */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = (do_fsync != 0) ? 1 : 0;
}

208
/* Try to store the delta so we can try to resolve it later */
209
static int store_delta(git_indexer *idx)
210
{
211 212
	struct delta_info *delta;

213
	delta = git__calloc(1, sizeof(struct delta_info));
214
	GIT_ERROR_CHECK_ALLOC(delta);
215
	delta->delta_off = idx->entry_start;
216

217
	if (git_vector_insert(&idx->deltas, delta) < 0)
218 219 220 221 222
		return -1;

	return 0;
}

223
/*
 * Format the loose-object header for an object of `type` and length `len`
 * and feed it to `ctx`. Returns the error from formatting, otherwise the
 * result of git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char header[64];
	size_t header_len;
	int error;

	error = git_odb__format_object_header(&header_len,
		header, sizeof(header), (size_t)len, type);
	if (error < 0)
		return error;

	return git_hash_update(ctx, header, header_len);
}

236
/*
 * Read the inflated data of the current entry from `stream` and feed it
 * to idx->hash_ctx. When do_verify is set the data is also accumulated in
 * idx->entry_data.
 *
 * Returns 0 once the stream reports end of data, or a negative error code
 * from the stream read, the buffer append or the hash update.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t nread;
	int error;

	GIT_ASSERT_ARG(idx);
	GIT_ASSERT_ARG(stream);

	do {
		if ((nread = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) < 0)
			break;

		/* Report failures instead of silently hashing/verifying partial data. */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, nread)) < 0)
			return error;

		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, nread)) < 0)
			return error;
	} while (nread > 0);

	if (nread < 0)
		return (int)nread;

	return 0;
}

259
/* In order to create the packfile stream, we need to skip over the delta base description */
260
static int advance_delta_offset(git_indexer *idx, git_object_t type)
261 262 263
{
	git_mwindow *w = NULL;

264
	GIT_ASSERT_ARG(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
265

266
	if (type == GIT_OBJECT_REF_DELTA) {
267 268
		idx->off += GIT_OID_RAWSZ;
	} else {
269 270
		off64_t base_off;
		int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
271
		git_mwindow_close(&w);
272 273
		if (error < 0)
			return error;
274 275 276 277 278 279
	}

	return 0;
}

/* Read from the stream and discard any output */
280
static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
281 282 283
{
	ssize_t read;

284
	GIT_ASSERT_ARG(stream);
285 286

	do {
287
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
288 289 290 291 292 293 294 295
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

296
/*
 * Compute the CRC-32 of `size` bytes of the pack starting at `start`,
 * reading through memory windows, and store it in `crc_out` after htonl().
 *
 * Returns 0 on success, -1 if a window cannot be opened.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	void *ptr;
	uint32_t crc;
	unsigned int left, len;
	git_mwindow *w = NULL;

	crc = crc32(0L, Z_NULL, 0);
	while (size) {
		ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left);
		if (ptr == NULL)
			return -1;

		/*
		 * Compare in 64 bits before narrowing: casting `size` to
		 * unsigned int first would truncate sizes of 4 GiB and above
		 * (yielding 0 for exact multiples and stalling the loop).
		 */
		if ((off64_t)left < size)
			len = left;
		else
			len = (unsigned int)size;

		crc = crc32(crc, ptr, len);
		size -= len;
		start += len;
		git_mwindow_close(&w);
	}

	*crc_out = htonl(crc);
	return 0;
}

320
/*
 * Record `oid` as an object the pack is expected to provide, unless it is
 * already known. Returns 0 when nothing had to be added, otherwise the
 * result of git_oidmap_set() (or -1 if the copy cannot be allocated).
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if ((!idx->odb || !git_odb_exists(idx->odb, oid)) &&
	    !git_oidmap_exists(idx->pack->idx_cache, oid) &&
	    !git_oidmap_exists(idx->expected_oids, oid)) {
		git_oid *dup = git__malloc(sizeof(*oid));
		int error;

		GIT_ERROR_CHECK_ALLOC(dup);
		git_oid_cpy(dup, oid);

		/* The copy is both key and value; release it if the map rejects it. */
		if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
			git__free(dup);

		return error;
	}

	return 0;
}

/*
 * Parse the raw object and update idx->expected_oids: the object itself is
 * removed from the expected set, and every object it references (tree
 * entries, commit parents and tree, tag target) is added via
 * add_expected_oid(). Objects of any other type are ignored.
 *
 * Returns 0 on success, the parse error if the object cannot be parsed,
 * or -1 if an expected OID cannot be recorded.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	git_object *object = NULL;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	/* Nothing has been acquired yet, so a parse failure can return directly. */
	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		return error;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid)) {
		/* Leave through `out` so the parsed object is released. */
		error = 0;
		goto out;
	}

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry) {
				if (add_expected_oid(idx, entry->oid) < 0) {
					error = -1;
					goto out;
				}
			}

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid) {
				if (add_expected_oid(idx, parent_oid) < 0) {
					error = -1;
					goto out;
				}
			}

			if (add_expected_oid(idx, &commit->tree_id) < 0) {
				error = -1;
				goto out;
			}

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if (add_expected_oid(idx, &tag->target) < 0) {
				error = -1;
				goto out;
			}

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}

414
/*
 * Finish the non-delta object that was just read: finalize its hash, run
 * the optional connectivity check, register it in the pack's idx_cache,
 * compute its CRC, append it to idx->objects and update the fanout table.
 *
 * Returns 0 on success, -1 on failure. On failure every allocation that
 * has not been handed over to idx_cache is released.
 */
static int store_object(git_indexer *idx)
{
	int i;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry = NULL;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	if (git_hash_final(&oid, &idx->hash_ctx))
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
		    idx->entry_data.ptr,
		    idx->entry_data.size,
		    idx->entry_type
		};

		if (check_object_connectivity(idx, &rawobj) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if (git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* idx_cache now references pentry; it must not be freed below. */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}

488 489
/* Tell whether `id` is already registered in the pack's idx_cache. */
GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
{
	git_oidmap *cache = idx->pack->idx_cache;

	return git_oidmap_exists(cache, id);
}

493
/*
 * Register an already-hashed object: record its offset in `entry` and
 * `pentry`, insert `pentry` into the pack's idx_cache keyed by its sha1,
 * append `entry` to idx->objects and bump the fanout counters.
 *
 * Returns 0 on success, -1 if the object is already cached or cannot be
 * inserted.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	git_oidmap *cache = idx->pack->idx_cache;
	int bucket;

	/* Offsets that do not fit in 31 bits are flagged and kept in offset_long. */
	if (entry_start <= UINT31_MAX) {
		entry->offset = (uint32_t)entry_start;
	} else {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(cache, &pentry->sha1) ||
	    git_oidmap_set(cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		return -1;

	/* Every fanout slot from the first OID byte upwards counts this object. */
	for (bucket = entry->oid.id[0]; bucket < 256; ++bucket)
		idx->fanout[bucket]++;

	return 0;
}

523
/*
 * Hash a fully unpacked object, compute the CRC of its packed entry
 * (from `entry_start` up to idx->off) and register it via save_entry().
 *
 * Returns 0 on success, -1 on failure. On every failure path obj->data is
 * freed and reset to NULL; on success obj->data is left to the caller.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);
	entry->crc = crc32(0L, Z_NULL, 0);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) < 0) {
		/*
		 * NOTE(review): save_entry() may fail after pentry has been
		 * placed in idx_cache, so entry/pentry are not released here;
		 * only the object data, which we still own, is.
		 */
		git__free(obj->data);
		obj->data = NULL;
		return -1;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	obj->data = NULL;
	return -1;
}
557

558
/*
 * Invoke the user's progress callback, if one was configured, passing its
 * result through git_error_set_after_callback_function(). Returns 0 when
 * no callback is set.
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}

567
/* Hash everything but the last 20B of input */
568
static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
569
{
570
	size_t to_expell, to_keep;
571 572 573 574 575

	if (size == 0)
		return;

	/* Easy case, dump the buffer and the data minus the last 20 bytes */
576
	if (size >= GIT_OID_RAWSZ) {
577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
		git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
		git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);

		data += size - GIT_OID_RAWSZ;
		memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
		idx->inbuf_len = GIT_OID_RAWSZ;
		return;
	}

	/* We can just append */
	if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
		memcpy(idx->inbuf + idx->inbuf_len, data, size);
		idx->inbuf_len += size;
		return;
	}

	/* We need to partially drain the buffer and then append */
594 595
	to_keep   = GIT_OID_RAWSZ - size;
	to_expell = idx->inbuf_len - to_keep;
596 597 598 599 600 601 602 603

	git_hash_update(&idx->trailer, idx->inbuf, to_expell);

	memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
	memcpy(idx->inbuf + to_keep, data, size);
	idx->inbuf_len += size - to_expell;
}

604
/*
 * Write `size` bytes from `data` into the pack file at `offset`.
 *
 * With NO_MMAP the data is written with p_pwrite(); otherwise the target
 * range is mapped (starting at an alignment boundary) and the data copied
 * into the mapping. Returns 0 on success, a negative value on failure.
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
#ifdef NO_MMAP
	const char *cursor = (const char *)data;
	size_t pending = size;

	/* Handle data size larger than ssize_t */
	while (pending > 0) {
		ssize_t written;

		HANDLE_EINTR(written, p_pwrite(idx->pack->mwf.fd, (void *)cursor,
					       pending, offset));
		if (written <= 0)
			return -1;

		cursor += written;
		offset += written;
		pending -= written;
	}
#else
	git_file fd = idx->pack->mwf.fd;
	size_t alignment;
	size_t in_page;
	off64_t map_start;
	git_map map;
	int error;

	GIT_ASSERT_ARG(data);
	GIT_ASSERT_ARG(size);

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	in_page = offset % alignment;
	map_start = offset - in_page;

	error = p_mmap(&map, in_page + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, map_start);
	if (error < 0)
		return error;

	memcpy((unsigned char *)map.data + in_page, data, size);
	p_munmap(&map);
#endif

	return 0;
}

/*
 * Append `size` bytes of `data` at the current end of the pack
 * (idx->pack->mwf.size). The recorded size is NOT updated here; callers
 * adjust mwf.size themselves. Returns 0 on success or when size is 0.
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	off64_t old_size = idx->pack->mwf.size;
	off64_t grown_size, last_page;
	size_t alignment, tail;
	int fd = idx->pack->mwf.fd;
	int error;

	if (size == 0)
		return 0;

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	grown_size = old_size + size;
	tail = grown_size % alignment;
	last_page = grown_size - tail;

	if (p_lseek(fd, last_page + alignment - 1, SEEK_SET) < 0 ||
	    p_write(fd, data, 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, old_size, size);
}

685
/*
 * Parse the next entry of the pack at idx->off.
 *
 * Returns GIT_EBUFS when more input is needed (the open stream, if any,
 * is kept so parsing resumes on the next call), a negative error on
 * failure, the progress callback's result if it is non-zero, or 0 once
 * the entry has been stored and counted in `stats`.
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	off64_t entry_start = idx->off;
	size_t entry_size;
	git_object_t type;
	git_mwindow *w = NULL;
	int error;

	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	if (!idx->have_stream) {
		error = git_packfile_unpack_header(&entry_size, &type, idx->pack, &w, &idx->off);
		if (error == GIT_EBUFS) {
			/* Rewind so the header is parsed again once more data arrives. */
			idx->off = entry_start;
			return error;
		}
		if (error < 0)
			return error;

		git_mwindow_close(&w);
		idx->entry_start = entry_start;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
			error = advance_delta_offset(idx, type);
			if (error == GIT_EBUFS) {
				idx->off = entry_start;
				return error;
			}
			if (error < 0)
				return error;

			idx->have_delta = 1;
		} else {
			idx->have_delta = 0;

			error = hash_header(&idx->hash_ctx, entry_size, type);
			if (error < 0)
				return error;
		}

		idx->have_stream = 1;
		idx->entry_type = type;

		error = git_packfile_stream_open(stream, idx->pack, idx->off);
		if (error < 0)
			return error;
	}

	/* Deltas are only skipped over here; plain objects are hashed. */
	error = idx->have_delta ?
		read_object_stream(idx, stream) :
		hash_object_stream(idx, stream);

	idx->off = stream->curpos;
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	error = idx->have_delta ? store_delta(idx) : store_object(idx);
	if (error < 0)
		return error;

	if (!idx->have_delta)
		stats->indexed_objects++;
	stats->received_objects++;

	return do_progress_callback(idx, stats);
}

774
/*
 * Feed `size` bytes of pack data to the indexer.
 *
 * The data is appended to the pack file and folded into the running
 * trailer hash. Once enough bytes for the pack header are present it is
 * parsed, and as many complete entries as are available are processed.
 *
 * Returns 0 on success (including when more data is needed), or a
 * negative error code.
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	GIT_ASSERT_ARG(idx);
	GIT_ASSERT_ARG(data);
	GIT_ASSERT_ARG(stats);

	/* Only dereference idx once the argument checks above have passed. */
	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* Pass the length through untruncated; it is a size_t on both sides. */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* Compare at full width so large pack sizes are not truncated. */
		if (idx->pack->mwf.size < (off64_t)sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	if ((error = git_mwindow_free_all(mwf)) < 0)
		goto on_error;

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* GIT_EBUFS only means we ran out of input for now. */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
854

855
/*
 * Rewrite `path` in place so that everything after its last '/' becomes
 * "pack-<hex of idx->hash><suffix>". Returns 0 on success, -1 if the
 * buffer cannot be grown or is in an out-of-memory state.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len;

	/* search backwards for '/' */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	if (git_buf_grow(path, dir_len + 1 + strlen(prefix) +
					 GIT_OID_HEXSZ + strlen(suffix) + 1) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* The hex id is written directly into the buffer, so bump the size by hand. */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	if (git_buf_oom(path))
		return -1;

	return 0;
}

877 878 879 880
/**
 * Rewind the packfile by the trailer, as we might need to fix the
 * packfile by injecting objects at the tail and must overwrite it.
 */
881
static int seek_back_trailer(git_indexer *idx)
882 883
{
	idx->pack->mwf.size -= GIT_OID_RAWSZ;
884
	return git_mwindow_free_all(&idx->pack->mwf);
885 886
}

887
/*
 * Read object `id` from the ODB and append it to the pack, overwriting
 * the existing trailer: object header, deflated data, then a zeroed
 * placeholder trailer. The object is then registered via save_entry().
 *
 * Returns 0 on success or a negative error code. All temporary resources
 * (ODB object, deflate buffer) are released on every path.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj = NULL;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	if ((error = seek_back_trailer(idx)) < 0)
		goto cleanup;

	entry_start = idx->pack->mwf.size;

	if ((error = git_odb_read(&obj, idx->odb, id)) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		goto cleanup;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		/* Go through cleanup so the ODB object is released. */
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	if ((error = git_packfile__object_header(&hdr_len, hdr, len, git_odb_object_type(obj))) < 0 ||
	    (error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));
	git_buf_dispose(&buf);

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

cleanup:
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	/* Covers the paths that fail after deflating but before the dispose above. */
	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}

963
/*
 * Try to make progress on a thin pack: locate the first unresolved
 * REF delta, read the OID of its base and, unless that object is already
 * in the pack, inject it from the ODB and count it in
 * stats->local_objects.
 *
 * Returns 0 on success, a negative value on failure.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	int error, found_ref_delta = 0;
	unsigned int pos;
	struct delta_info *delta;
	size_t size;
	git_object_t type;
	git_mwindow *w = NULL;
	off64_t curpos = 0;
	unsigned char *base_info;
	unsigned int left = 0;
	git_oid base;

	GIT_ASSERT(git_vector_length(&idx->deltas) > 0);

	if (idx->odb == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, pos, delta) {
		/* Slots of already resolved deltas are NULL. */
		if (delta == NULL)
			continue;

		curpos = delta->delta_off;
		error = git_packfile_unpack_header(&size, &type, idx->pack, &w, &curpos);
		if (error < 0)
			return error;

		if (type == GIT_OBJECT_REF_DELTA) {
			found_ref_delta = 1;
			break;
		}
	}

	if (!found_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* curpos now points to the base information, which is an OID */
	base_info = git_mwindow_open(&idx->pack->mwf, &w, curpos, GIT_OID_RAWSZ, &left);
	if (base_info == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, base_info);
	git_mwindow_close(&w);

	/* Nothing to inject when the base is already part of the pack. */
	if (!has_entry(idx, &base)) {
		if (inject_object(idx, &base) < 0)
			return -1;

		stats->local_objects++;
	}

	return 0;
}

1025
/*
 * Repeatedly walk the pending deltas, unpacking and registering every one
 * whose base is available. When a full pass makes no progress,
 * fix_thin_pack() is asked to inject a missing base.
 *
 * Returns 0 when no pending deltas remain, or a negative value on failure
 * (including a negative result from the progress callback).
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* The unpacked data is ours; release it before retrying later. */
				git__free(obj.data);
				continue;
			}

			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}

1082
/*
 * Rewrite the pack header with the object count including injected local
 * objects, then recompute the trailer hash over the whole file.
 *
 * Returns 0 on success, -1 on failure.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *w = NULL;
	size_t chunk = 1024*1024;
	off64_t hashed;
	unsigned int left;
	void *ptr;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * We now use the same technique as before to determine the
	 * hash. We keep reading up to the end and let
	 * hash_partially() keep the existing trailer out of the
	 * calculation.
	 */
	if (git_mwindow_free_all(mwf) < 0)
		return -1;

	idx->inbuf_len = 0;
	for (hashed = 0; hashed < mwf->size; hashed += left) {
		ptr = git_mwindow_open(mwf, &w, hashed, chunk, &left);
		if (ptr == NULL)
			return -1;

		hash_partially(idx, ptr, left);
		git_mwindow_close(&w);
	}

	return 0;
}

1125
/*
 * Finalize the pack that has been fed to the indexer.
 *
 * Verifies the pack trailer, resolves the outstanding deltas, rewrites
 * the header and trailer if local objects were injected, writes the
 * version 2 index file next to the pack and finally moves the pack
 * itself to its final name.
 *
 * Returns 0 on success and a negative value on failure (either -1 or
 * the error code reported by resolve_deltas()).
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	/* Without a finalized hash there is nothing meaningful to compare */
	if (git_hash_final(&trailer_hash, &idx->trailer) < 0)
		return -1;

	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		if (git_hash_final(&trailer_hash, &idx->trailer) < 0)
			return -1;

		/*
		 * The pack contents changed, so the old trailer is stale; if
		 * the new one cannot be stored the pack must not be committed.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name by swapping the "pack" suffix for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		return -1;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/*
		 * An offset of UINT32_MAX marks an entry whose real offset
		 * lives in offset_long; store its slot in the long-offset
		 * table with the high bit set instead.
		 */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid));

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	if (git_mwindow_free_all(&idx->pack->mwf) < 0)
		goto on_error;

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* Go through the common cleanup so that 'filename' is released */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}

1327
/*
 * Release an indexer together with the resources it holds.
 * Passing NULL is a no-op.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *oid_key;
	git_oid *oid_value;
	size_t pos = 0;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* The cache owns its entries, so free each one before the map itself */
	if (idx->pack->idx_cache) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/*
	 * NOTE(review): the second argument is set only when the pack was
	 * never committed; presumably it asks for the temporary pack file
	 * to be removed -- confirm against git_packfile_free().
	 */
	git_packfile_free(idx->pack, !idx->pack_committed);

	/* Free the values stored in the expected-OID map, then the map */
	while (git_oidmap_iterate((void **) &oid_value, idx->expected_oids, &pos, &oid_key) == 0)
		git__free(oid_value);
	git_oidmap_free(idx->expected_oids);

	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_hash_ctx_cleanup(&idx->trailer);
	git_buf_dispose(&idx->entry_data);

	git__free(idx);
}