indexer.c 31.7 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3
 *
Vicent Marti committed
4 5
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
6 7
 */

8 9
#include "indexer.h"

Carlos Martín Nieto committed
10
#include "git2/indexer.h"
11
#include "git2/object.h"
Carlos Martín Nieto committed
12

13 14 15
#include "commit.h"
#include "tree.h"
#include "tag.h"
16
#include "pack.h"
Carlos Martín Nieto committed
17
#include "mwindow.h"
18
#include "posix.h"
19 20
#include "pack.h"
#include "filebuf.h"
21
#include "oid.h"
22
#include "oidarray.h"
23
#include "oidmap.h"
24
#include "zstream.h"
25
#include "object.h"
26

27
/*
 * Upper bound on the object count a pack header may declare;
 * git_indexer_append() fails with "too many objects" above this value.
 */
size_t git_indexer__max_objects = UINT32_MAX;
28

29
/*
 * Largest pack offset recorded directly in an entry's 32-bit `offset`;
 * entries starting beyond it use `offset_long` instead.
 */
#define UINT31_MAX (0x7FFFFFFF)
30

31
struct entry {
32
	git_oid oid;
33 34 35 36 37
	uint32_t crc;
	uint32_t offset;
	uint64_t offset_long;
};

38
struct git_indexer {
39
	unsigned int parsed_header :1,
40
		pack_committed :1,
41
		have_stream :1,
42
		have_delta :1,
43 44
		do_fsync :1,
		do_verify :1;
45
	struct git_pack_header hdr;
46
	struct git_pack_file *pack;
47
	unsigned int mode;
48 49
	off64_t off;
	off64_t entry_start;
50
	git_object_t entry_type;
51
	git_buf entry_data;
52
	git_packfile_stream stream;
53 54 55 56
	size_t nr_objects;
	git_vector objects;
	git_vector deltas;
	unsigned int fanout[256];
57
	git_hash_ctx hash_ctx;
58
	git_oid hash;
59
	git_indexer_progress_cb progress_cb;
60
	void *progress_payload;
61
	char objbuf[8*1024];
62

63 64 65
	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

66 67 68
	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

69 70
	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];
71
	size_t inbuf_len;
72
	git_hash_ctx trailer;
73 74 75
};

struct delta_info {
76
	off64_t delta_off;
77 78
};

79
const git_oid *git_indexer_hash(const git_indexer *idx)
80 81 82 83
{
	return &idx->hash;
}

84
static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
85 86
{
	int error;
87
	git_map map;
88

89
	if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
90
		return error;
91

92 93 94 95
	memcpy(hdr, map.data, sizeof(*hdr));
	p_munmap(&map);

	/* Verify we recognize this pack file format. */
96
	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
97
		git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
98 99
		return -1;
	}
100

101
	if (!pack_version_ok(hdr->hdr_version)) {
102
		git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
103 104
		return -1;
	}
Carlos Martín Nieto committed
105

106
	return 0;
107 108
}

109
static int objects_cmp(const void *a, const void *b)
110 111 112 113
{
	const struct entry *entrya = a;
	const struct entry *entryb = b;

114
	return git_oid__cmp(&entrya->oid, &entryb->oid);
115 116
}

117
/*
 * Initialize `opts` from the GIT_INDEXER_OPTIONS_INIT template for the
 * given structure `version`. Returns 0 once the template has been applied.
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(opts, version,
		git_indexer_options, GIT_INDEXER_OPTIONS_INIT);

	return 0;
}

124
#ifndef GIT_DEPRECATE_HARD
125 126 127 128
int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
{
	return git_indexer_options_init(opts, version);
}
129
#endif
130

131 132
int git_indexer_new(
		git_indexer **out,
133
		const char *prefix,
134
		unsigned int mode,
135
		git_odb *odb,
136
		git_indexer_options *in_opts)
137
{
138
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
139
	git_indexer *idx;
140
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
141
	static const char suff[] = "/pack";
142
	int error, fd = -1;
143

144 145 146
	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

147
	idx = git__calloc(1, sizeof(git_indexer));
148
	GIT_ERROR_CHECK_ALLOC(idx);
149
	idx->odb = odb;
150 151
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
152
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
153
	git_buf_init(&idx->entry_data, 0);
154

155 156 157
	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0 ||
	    (error = git_hash_ctx_init(&idx->trailer)) < 0 ||
	    (error = git_oidmap_new(&idx->expected_oids)) < 0)
158
		goto cleanup;
159

160
	idx->do_verify = opts.verify;
161

162
	if (git_repository__fsync_gitdir)
163 164
		idx->do_fsync = 1;

165 166 167 168
	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

169
	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
170
	git_buf_dispose(&path);
171 172 173 174
	if (fd < 0)
		goto cleanup;

	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
175
	git_buf_dispose(&tmp_path);
176

177 178 179
	if (error < 0)
		goto cleanup;

180 181 182 183
	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

184 185 186 187
	*out = idx;
	return 0;

cleanup:
188 189 190
	if (fd != -1)
		p_close(fd);

lhchavez committed
191 192
	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));
193 194

	if (idx->pack != NULL)
lhchavez committed
195
		p_unlink(idx->pack->pack_name);
196

197 198
	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
199 200 201 202
	git__free(idx);
	return -1;
}

203 204 205 206 207
/* Set the indexer's fsync flag; any non-zero `do_fsync` enables it. */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = (do_fsync != 0) ? 1 : 0;
}

208
/* Try to store the delta so we can try to resolve it later */
209
static int store_delta(git_indexer *idx)
210
{
211 212
	struct delta_info *delta;

213
	delta = git__calloc(1, sizeof(struct delta_info));
214
	GIT_ERROR_CHECK_ALLOC(delta);
215
	delta->delta_off = idx->entry_start;
216

217
	if (git_vector_insert(&idx->deltas, delta) < 0)
218 219 220 221 222
		return -1;

	return 0;
}

223
/*
 * Format the object header for an object of `len` bytes and the given
 * `type`, and feed it into `ctx`.
 *
 * Returns the formatting error if the header cannot be built, otherwise
 * the result of git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char header[64];
	size_t header_len;
	int rc;

	rc = git_odb__format_object_header(&header_len,
		header, sizeof(header), (size_t)len, type);
	if (rc < 0)
		return rc;

	return git_hash_update(ctx, header, header_len);
}

236
/*
 * Drain the packfile stream for the current object, feeding everything
 * read into idx->hash_ctx and, when verification is enabled, appending it
 * to idx->entry_data as well.
 *
 * Returns 0 once the stream reports end of data, the (negative) result of
 * git_packfile_stream_read() if reading fails, or the error from buffering
 * or hashing the data.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t read;
	int error;

	GIT_ASSERT_ARG(idx);
	GIT_ASSERT_ARG(stream);

	do {
		if ((read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) < 0)
			break;

		/* Do not silently drop data: a failed append would corrupt verification. */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, read)) < 0)
			return error;

		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, read)) < 0)
			return error;
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

259
/* In order to create the packfile stream, we need to skip over the delta base description */
260
static int advance_delta_offset(git_indexer *idx, git_object_t type)
261 262 263
{
	git_mwindow *w = NULL;

264
	GIT_ASSERT_ARG(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
265

266
	if (type == GIT_OBJECT_REF_DELTA) {
267 268
		idx->off += GIT_OID_RAWSZ;
	} else {
269 270
		off64_t base_off;
		int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
271
		git_mwindow_close(&w);
272 273
		if (error < 0)
			return error;
274 275 276 277 278 279
	}

	return 0;
}

/* Read from the stream and discard any output */
280
static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
281 282 283
{
	ssize_t read;

284
	GIT_ASSERT_ARG(stream);
285 286

	do {
287
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
288 289 290 291 292 293 294 295
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

296
/*
 * Compute the CRC32 of `size` bytes of the pack starting at `start`,
 * walking the file one window at a time.
 *
 * The checksum is stored in `crc_out` after htonl() conversion.
 * Returns 0 on success and -1 when a window cannot be opened.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	git_mwindow *window = NULL;
	uint32_t running = crc32(0L, Z_NULL, 0);

	while (size) {
		unsigned int avail, take;
		void *chunk;

		chunk = git_mwindow_open(mwf, &window, start, (size_t)size, &avail);
		if (chunk == NULL)
			return -1;

		/* Never consume more than the window offers or the object needs. */
		take = min(avail, (unsigned int)size);
		running = crc32(running, chunk, take);

		size -= take;
		start += take;
		git_mwindow_close(&window);
	}

	*crc_out = htonl(running);
	return 0;
}

320
/*
 * Record `oid` as an object that must still show up for the pack to be
 * connected. Nothing is recorded when the oid is already known.
 *
 * Returns 0 on success (including when nothing had to be added), -1 on
 * allocation failure, or the error from git_oidmap_set().
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	git_oid *dup;
	int error;

	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if ((idx->odb && git_odb_exists(idx->odb, oid)) ||
	    git_oidmap_exists(idx->pack->idx_cache, oid) ||
	    git_oidmap_exists(idx->expected_oids, oid))
		return 0;

	dup = git__malloc(sizeof(*oid));
	GIT_ERROR_CHECK_ALLOC(dup);
	git_oid_cpy(dup, oid);

	/* The copy serves as both key and value; free it if the map rejects it. */
	if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
		git__free(dup);

	return error;
}

/*
 * Update the set of expected oids for a fully inflated object.
 *
 * The object itself is removed from idx->expected_oids. Unless the object
 * already exists in the ODB, every oid it references (tree entries, commit
 * parents and tree, tag target) is added via add_expected_oid().
 *
 * Objects that are not blobs, trees, commits or tags are ignored.
 * Returns 0 on success or a negative value when parsing the object or
 * recording an expected oid fails. The parsed object is released on every
 * path.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	/* Start out NULL so the shared exit path can always free it. */
	git_object *object = NULL;
	git_oid *expected;
	int error = 0;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		goto out;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid))
		goto out;

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry) {
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;
			}

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid) {
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;
			}

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}

414
/*
 * Finish the non-delta object that was just streamed: finalize its hash,
 * optionally check connectivity, register it in the pack's idx_cache,
 * compute its CRC, append it to the objects vector and update the fanout.
 *
 * Returns 0 on success and -1 on failure. On failure every allocation
 * still owned by this function is released.
 */
static int store_object(git_indexer *idx)
{
	int i, error;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry = NULL;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	/* Leave through on_error so that `entry` is not leaked. */
	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	if (git_hash_final(&oid, &idx->hash_ctx))
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
		    idx->entry_data.ptr,
		    idx->entry_data.size,
		    idx->entry_type
		};

		if ((error = check_object_connectivity(idx, &rawobj)) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if ((error = git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry)) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* The cache now references pentry; it must survive later failures. */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* Every bucket at or above the first oid byte counts this object. */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}

488 489
/* Report whether `id` is already present in the pack's idx_cache. */
GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
{
	git_oidmap *cache = idx->pack->idx_cache;

	return git_oidmap_exists(cache, id);
}

493
/*
 * Register an already hashed object: record its offset, add `pentry` to
 * the pack's idx_cache, append `entry` to the objects vector and bump the
 * fanout counters.
 *
 * Returns 0 on success, -1 if the object is already cached, cannot be
 * cached, or cannot be appended to the vector.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	git_oidmap *cache = idx->pack->idx_cache;
	int bucket, rejected;

	/* Offsets that do not fit the 31-bit range go to the long field. */
	if (entry_start <= UINT31_MAX) {
		entry->offset = (uint32_t)entry_start;
	} else {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	}

	pentry->offset = entry_start;

	rejected = git_oidmap_exists(cache, &pentry->sha1);
	if (!rejected)
		rejected = git_oidmap_set(cache, &pentry->sha1, pentry) < 0;

	if (rejected) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		return -1;

	bucket = entry->oid.id[0];
	while (bucket < 256) {
		idx->fanout[bucket]++;
		++bucket;
	}

	return 0;
}

523
/*
 * Hash a resolved object, compute the CRC of its packed representation
 * (from `entry_start` up to idx->off) and register it via save_entry().
 *
 * Returns 0 on success and -1 on failure. On every failure path
 * `obj->data` is freed and reset to NULL, and any entry still owned by
 * this function is released.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) == 0)
		return 0;

	/*
	 * save_entry() never keeps `entry` when it fails, but it may already
	 * have placed `pentry` in the cache. Only free `pentry` if the cache
	 * does not reference it.
	 */
	if (git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
		pentry = NULL;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	obj->data = NULL;
	return -1;
}
557

558
/*
 * Invoke the user's progress callback, if one was configured, and pass
 * its result through git_error_set_after_callback_function().
 * Returns 0 when no callback is set.
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}

567
/* Hash everything but the last 20B of input */
568
static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
569
{
570
	size_t to_expell, to_keep;
571 572 573 574 575

	if (size == 0)
		return;

	/* Easy case, dump the buffer and the data minus the last 20 bytes */
576
	if (size >= GIT_OID_RAWSZ) {
577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593
		git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
		git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);

		data += size - GIT_OID_RAWSZ;
		memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
		idx->inbuf_len = GIT_OID_RAWSZ;
		return;
	}

	/* We can just append */
	if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
		memcpy(idx->inbuf + idx->inbuf_len, data, size);
		idx->inbuf_len += size;
		return;
	}

	/* We need to partially drain the buffer and then append */
594 595
	to_keep   = GIT_OID_RAWSZ - size;
	to_expell = idx->inbuf_len - to_keep;
596 597 598 599 600 601 602 603

	git_hash_update(&idx->trailer, idx->inbuf, to_expell);

	memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
	memcpy(idx->inbuf + to_keep, data, size);
	idx->inbuf_len += size - to_expell;
}

604
/*
 * Copy `size` bytes from `data` into the pack file at byte `offset`.
 *
 * With NO_MMAP the data is written with p_pwrite(), looping until all of
 * it has been written. Otherwise the target region is mapped (starting at
 * the preceding alignment boundary) and filled with memcpy().
 *
 * Returns 0 on success, -1 when a write fails, or the error from
 * git__mmap_alignment() / p_mmap().
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
#ifdef NO_MMAP
	const char *cursor = (const char *)data;
	size_t pending = size;

	/* Handle data size larger than ssize_t */
	while (pending > 0) {
		ssize_t written;

		HANDLE_EINTR(written, p_pwrite(idx->pack->mwf.fd, (void *)cursor,
					       pending, offset));
		if (written <= 0)
			return -1;

		cursor += written;
		offset += written;
		pending -= written;
	}
#else
	git_file fd = idx->pack->mwf.fd;
	size_t alignment;
	size_t in_page;
	off64_t map_start;
	unsigned char *dest;
	git_map map;
	int error;

	GIT_ASSERT_ARG(data);
	GIT_ASSERT_ARG(size);

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	in_page = offset % alignment;
	map_start = offset - in_page;

	error = p_mmap(&map, in_page + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, map_start);
	if (error < 0)
		return error;

	dest = (unsigned char *)map.data;
	memcpy(dest + in_page, data, size);
	p_munmap(&map);
#endif

	return 0;
}

/*
 * Write `size` bytes of `data` at the current end of the pack
 * (idx->pack->mwf.size). The recorded size itself is not updated here;
 * callers adjust it afterwards.
 *
 * Returns 0 on success (or when `size` is 0), -1 if the file cannot be
 * extended, or the error from git__mmap_alignment() / write_at().
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	off64_t old_size = idx->pack->mwf.size;
	off64_t grown_size;
	off64_t last_page;
	size_t alignment;
	size_t in_page;
	int error;

	if (size == 0)
		return 0;

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	grown_size = old_size + size;
	in_page = grown_size % alignment;
	last_page = grown_size - in_page;

	if (p_pwrite(idx->pack->mwf.fd, data, 1, last_page + alignment - 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, idx->pack->mwf.size, size);
}

683
/*
 * Try to consume one complete entry from the pack at idx->off.
 *
 * When no stream is open, the entry header is parsed, per-entry state is
 * reset and a packfile stream is opened on the entry data. The stream is
 * then drained: deltas are only skipped over, other objects are hashed.
 * Once the entry is complete it is stored (as a delta to resolve later,
 * or as an object), the statistics are updated and the progress callback
 * is invoked.
 *
 * Returns 0 when an entry was fully processed, GIT_EBUFS when more input
 * is required, or another error code on failure.
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	off64_t entry_start = idx->off;
	git_mwindow *w = NULL;
	git_object_t type;
	size_t entry_size;
	int error;

	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	if (!idx->have_stream) {
		error = git_packfile_unpack_header(&entry_size, &type, idx->pack, &w, &idx->off);
		if (error < 0) {
			/* Not enough data yet: rewind so the header is parsed again. */
			if (error == GIT_EBUFS)
				idx->off = entry_start;
			return error;
		}

		git_mwindow_close(&w);
		idx->entry_start = entry_start;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
			error = advance_delta_offset(idx, type);
			if (error < 0) {
				if (error == GIT_EBUFS)
					idx->off = entry_start;
				return error;
			}

			idx->have_delta = 1;
		} else {
			idx->have_delta = 0;

			if ((error = hash_header(&idx->hash_ctx, entry_size, type)) < 0)
				return error;
		}

		idx->have_stream = 1;
		idx->entry_type = type;

		if ((error = git_packfile_stream_open(stream, idx->pack, idx->off)) < 0)
			return error;
	}

	error = idx->have_delta ?
		read_object_stream(idx, stream) :
		hash_object_stream(idx, stream);

	idx->off = stream->curpos;
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	error = idx->have_delta ? store_delta(idx) : store_object(idx);
	if (error < 0)
		return error;

	/* Deltas only count as indexed once they have been resolved. */
	if (!idx->have_delta)
		stats->indexed_objects++;
	stats->received_objects++;

	return do_progress_callback(idx, stats);
}

772
/*
 * Add `size` bytes of pack data to the indexer and process as many
 * complete entries as the data received so far allows.
 *
 * The data is appended to the pack file and fed to the trailer hash. On
 * the first call that completes the pack header, the header is parsed,
 * the object containers are set up and `stats` is reset. Entries are then
 * read until all announced objects are indexed or more input is needed.
 *
 * Returns 0 on success (including when more data is required) or a
 * negative error code.
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	GIT_ASSERT_ARG(idx);
	GIT_ASSERT_ARG(data);
	GIT_ASSERT_ARG(stats);

	/* Only touch `idx` after the argument checks above have passed. */
	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* Pass the full size_t length; narrowing it to int would truncate. */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* Compare at full width so a large pack size is never truncated. */
		if (idx->pack->mwf.size < (off64_t)sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	if ((error = git_mwindow_free_all(mwf)) < 0)
		goto on_error;

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* Running out of buffered data is not an error; wait for more. */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
852

853
/*
 * Rewrite `path` in place so that its final component becomes
 * "pack-<hex of idx->hash><suffix>", keeping everything up to and
 * including the last '/'.
 *
 * Returns 0 on success and -1 when the buffer cannot be grown.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len = (size_t)path->size;
	size_t needed;

	/* search backwards for '/' */
	while (dir_len > 0 && path->ptr[dir_len - 1] != '/')
		dir_len--;

	needed = dir_len + 1 + strlen(prefix) +
		 GIT_OID_HEXSZ + strlen(suffix) + 1;
	if (git_buf_grow(path, needed) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* The hex oid is written straight into the buffer, so fix up its size. */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	return git_buf_oom(path) ? -1 : 0;
}

875 876 877 878
/**
 * Rewind the packfile by the trailer, as we might need to fix the
 * packfile by injecting objects at the tail and must overwrite it.
 */
879
static int seek_back_trailer(git_indexer *idx)
880 881
{
	idx->pack->mwf.size -= GIT_OID_RAWSZ;
882
	return git_mwindow_free_all(&idx->pack->mwf);
883 884
}

885
/*
 * Append the object `id`, read from idx->odb, to the end of the pack so
 * that a thin pack becomes self-contained.
 *
 * The existing trailer is overwritten with the object header and the
 * deflated object data, followed by a placeholder trailer. The new object
 * is then registered through save_entry().
 *
 * Returns 0 on success or a negative error code. The ODB object and the
 * deflate buffer are released on every path; the entries are released on
 * failure unless the cache already references them.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj = NULL;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	if ((error = seek_back_trailer(idx)) < 0)
		goto cleanup;

	entry_start = idx->pack->mwf.size;

	if ((error = git_odb_read(&obj, idx->odb, id)) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		goto cleanup;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	/* Go through cleanup on allocation failure so `obj` is released. */
	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	if ((error = git_packfile__object_header(&hdr_len, hdr, len, git_odb_object_type(obj))) < 0 ||
	    (error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));
	git_buf_dispose(&buf);

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

cleanup:
	if (error) {
		git__free(entry);

		/*
		 * save_entry() may fail after it has stored pentry in the cache;
		 * freeing it then would leave a dangling pointer behind.
		 */
		if (pentry != NULL &&
		    git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) != pentry)
			git__free(pentry);
	}

	/* Covers the error paths taken after the deflate buffer was filled. */
	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}

961
/*
 * Make progress on a thin pack by injecting one missing delta base.
 *
 * The first REF_DELTA among the pending deltas is located and the oid of
 * its base is read from the pack. If that base is not yet part of the
 * pack it is injected from the ODB and stats->local_objects is bumped.
 *
 * Returns 0 on success (including when the base is already present) and
 * a negative value on failure.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	struct delta_info *delta;
	git_mwindow *w = NULL;
	unsigned char *base_info;
	git_object_t type;
	git_oid base;
	off64_t curpos = 0;
	size_t size;
	unsigned int i, left = 0;
	int error, found_ref_delta = 0;

	GIT_ASSERT(git_vector_length(&idx->deltas) > 0);

	if (!idx->odb) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, i, delta) {
		/* Slots of already resolved deltas are left as NULL. */
		if (delta == NULL)
			continue;

		curpos = delta->delta_off;
		if ((error = git_packfile_unpack_header(&size, &type, idx->pack, &w, &curpos)) < 0)
			return error;

		if (type != GIT_OBJECT_REF_DELTA)
			continue;

		found_ref_delta = 1;
		break;
	}

	if (!found_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* curpos now points to the base information, which is an OID */
	base_info = git_mwindow_open(&idx->pack->mwf, &w, curpos, GIT_OID_RAWSZ, &left);
	if (!base_info) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, base_info);
	git_mwindow_close(&w);

	/* Nothing to inject when the base is already in the pack. */
	if (has_entry(idx, &base))
		return 0;

	if (inject_object(idx, &base) < 0)
		return -1;

	stats->local_objects++;

	return 0;
}

1023
/*
 * Resolve the pending deltas, repeating passes over the list until none
 * remain.
 *
 * Each pass tries to unpack every remaining delta. Deltas whose base has
 * not been seen yet (GIT_PASSTHROUGH) are retried on a later pass. When a
 * pass resolves nothing, fix_thin_pack() is asked to inject a missing
 * base.
 *
 * Returns 0 when all deltas are resolved, -1 on failure, or the negative
 * value returned by the progress callback.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* The unpacked data is ours; release it before moving on. */
				git__free(obj.data);
				continue;
			}

			/* On failure hash_and_save() releases obj.data itself. */
			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}

1080
/*
 * After objects were injected into the pack, rewrite the object count in
 * the pack header and recompute the trailer hash over the whole file.
 *
 * Returns 0 on success and -1 when writing the header, dropping the
 * windows or mapping a chunk of the pack fails.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	const size_t chunk = 1024*1024;
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *w = NULL;
	unsigned int left;
	off64_t hashed;
	void *ptr;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * We now use the same technique as before to determine the
	 * hash. We keep reading up to the end and let
	 * hash_partially() keep the existing trailer out of the
	 * calculation.
	 */
	if (git_mwindow_free_all(mwf) < 0)
		return -1;

	idx->inbuf_len = 0;
	for (hashed = 0; hashed < mwf->size; hashed += left) {
		ptr = git_mwindow_open(mwf, &w, hashed, chunk, &left);
		if (!ptr)
			return -1;

		hash_partially(idx, ptr, left);
		git_mwindow_close(&w);
	}

	return 0;
}

1123
/*
 * Finalize the pack that has been fed to the indexer and write its
 * index file.
 *
 * The steps are, in order:
 *  - check that the header was parsed and that exactly one trailer's
 *    worth of data follows the last parsed object;
 *  - compare the trailer found in the file with the hash we calculated;
 *  - resolve the outstanding deltas;
 *  - if local objects were injected, rewrite the header and the trailer;
 *  - fail if objects referenced from the pack are still missing;
 *  - write the version 2 index (header, fanout, object names, CRC32
 *    values, offsets, long offsets, pack trailer, index hash);
 *  - commit the index, truncate/fsync/close the pack and rename it to
 *    its final name.
 *
 * Returns 0 on success. On failure a negative value is returned: either
 * -1 or whatever error resolve_deltas() reported.
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	git_hash_final(&trailer_hash, &idx->trailer);
	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		git_hash_final(&trailer_hash, &idx->trailer);

		/*
		 * The pack contents changed, so the old trailer is stale. If we
		 * cannot store the new one the pack would not match its index.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name: swap the "pack" suffix for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		goto on_error;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/*
	 * NOTE(review): the results of the individual writes below are not
	 * checked here; presumably the filebuf remembers a failed write and
	 * reports it on a later call -- confirm against filebuf.c.
	 */

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/*
		 * An offset of UINT32_MAX marks an entry whose real offset is
		 * kept in offset_long; store its position in the long offset
		 * table with the high bit set instead.
		 */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		/* High word first, each in network byte order */
		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid));

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	if (git_mwindow_free_all(&idx->pack->mwf) < 0)
		goto on_error;

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* go through the cleanup path so that 'filename' is released */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}

1325
/*
 * Release an indexer and everything it holds. Passing NULL is a no-op.
 *
 * The pack file is handed to git_packfile_free() together with a flag
 * telling it whether the pack was never committed.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *expected_key;
	git_oid *expected_value;
	size_t pos;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* The cache owns its entries, so release them before the map itself */
	if (idx->pack->idx_cache != NULL) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});
		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	git_packfile_free(idx->pack, !idx->pack_committed);

	/* Free every value still stored in the expected OIDs map */
	for (pos = 0;
	     git_oidmap_iterate((void **) &expected_value, idx->expected_oids, &pos, &expected_key) == 0;
	     )
		git__free(expected_value);

	git_hash_ctx_cleanup(&idx->trailer);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_buf_dispose(&idx->entry_data);
	git_oidmap_free(idx->expected_oids);

	git__free(idx);
}