/*
 * Copyright (C) the libgit2 contributors. All rights reserved.
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */

8 9
#include "indexer.h"

Carlos Martín Nieto committed
10
#include "git2/indexer.h"
11
#include "git2/object.h"
Carlos Martín Nieto committed
12

13 14 15
#include "commit.h"
#include "tree.h"
#include "tag.h"
16
#include "pack.h"
Carlos Martín Nieto committed
17
#include "mwindow.h"
18
#include "posix.h"
19 20
#include "pack.h"
#include "filebuf.h"
21
#include "oid.h"
22
#include "oidarray.h"
23
#include "oidmap.h"
24
#include "zstream.h"
25
#include "object.h"
26

27 28
/* Declared here without a definition; the object itself lives elsewhere. */
extern git_mutex git__mwindow_mutex;

/*
 * Largest object count a pack header may announce; git_indexer_append()
 * fails with "too many objects" when the header claims more than this.
 */
size_t git_indexer__max_objects = UINT32_MAX;

/* Entry offsets above this value do not go into the 32-bit `offset` field. */
#define UINT31_MAX (0x7FFFFFFF)
32

33
struct entry {
34
	git_oid oid;
35 36 37 38 39
	uint32_t crc;
	uint32_t offset;
	uint64_t offset_long;
};

40
struct git_indexer {
41
	unsigned int parsed_header :1,
42
		pack_committed :1,
43
		have_stream :1,
44
		have_delta :1,
45 46
		do_fsync :1,
		do_verify :1;
47
	struct git_pack_header hdr;
48
	struct git_pack_file *pack;
49
	unsigned int mode;
50 51
	off64_t off;
	off64_t entry_start;
52
	git_object_t entry_type;
53
	git_buf entry_data;
54
	git_packfile_stream stream;
55 56 57 58
	size_t nr_objects;
	git_vector objects;
	git_vector deltas;
	unsigned int fanout[256];
59
	git_hash_ctx hash_ctx;
60
	git_oid hash;
61
	git_indexer_progress_cb progress_cb;
62
	void *progress_payload;
63
	char objbuf[8*1024];
64

65 66 67
	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

68 69 70
	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

71 72
	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];
73
	size_t inbuf_len;
74
	git_hash_ctx trailer;
75 76 77
};

struct delta_info {
78
	off64_t delta_off;
79 80
};

81
const git_oid *git_indexer_hash(const git_indexer *idx)
82 83 84 85
{
	return &idx->hash;
}

86
static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
87 88
{
	int error;
89
	git_map map;
90

91
	if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
92
		return error;
93

94 95 96 97
	memcpy(hdr, map.data, sizeof(*hdr));
	p_munmap(&map);

	/* Verify we recognize this pack file format. */
98
	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
99
		git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
100 101
		return -1;
	}
102

103
	if (!pack_version_ok(hdr->hdr_version)) {
104
		git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
105 106
		return -1;
	}
Carlos Martín Nieto committed
107

108
	return 0;
109 110
}

111
static int objects_cmp(const void *a, const void *b)
112 113 114 115
{
	const struct entry *entrya = a;
	const struct entry *entryb = b;

116
	return git_oid__cmp(&entrya->oid, &entryb->oid);
117 118
}

119
/*
 * Initialise `opts` from the GIT_INDEXER_OPTIONS_INIT template for the
 * requested structure `version`.
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(opts, version,
		git_indexer_options, GIT_INDEXER_OPTIONS_INIT);

	return 0;
}

126 127 128 129 130
/* Alternate name for git_indexer_options_init(); forwards both arguments. */
int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
{
	int result = git_indexer_options_init(opts, version);

	return result;
}

131 132
int git_indexer_new(
		git_indexer **out,
133
		const char *prefix,
134
		unsigned int mode,
135
		git_odb *odb,
136
		git_indexer_options *in_opts)
137
{
138
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
139
	git_indexer *idx;
140
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
141
	static const char suff[] = "/pack";
142
	int error, fd = -1;
143

144 145 146
	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

147
	idx = git__calloc(1, sizeof(git_indexer));
148
	GIT_ERROR_CHECK_ALLOC(idx);
149
	idx->odb = odb;
150 151
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
152
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
153
	git_buf_init(&idx->entry_data, 0);
154

155 156 157
	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0 ||
	    (error = git_hash_ctx_init(&idx->trailer)) < 0 ||
	    (error = git_oidmap_new(&idx->expected_oids)) < 0)
158
		goto cleanup;
159

160
	idx->do_verify = opts.verify;
161

162
	if (git_repository__fsync_gitdir)
163 164
		idx->do_fsync = 1;

165 166 167 168
	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

169
	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
170
	git_buf_dispose(&path);
171 172 173 174
	if (fd < 0)
		goto cleanup;

	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
175
	git_buf_dispose(&tmp_path);
176

177 178 179
	if (error < 0)
		goto cleanup;

180 181 182 183
	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

184 185 186 187
	*out = idx;
	return 0;

cleanup:
188 189 190
	if (fd != -1)
		p_close(fd);

lhchavez committed
191 192
	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));
193 194

	if (idx->pack != NULL)
lhchavez committed
195
		p_unlink(idx->pack->pack_name);
196

197 198
	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
199 200 201 202
	git__free(idx);
	return -1;
}

203 204 205 206 207
/* Set the indexer's do_fsync flag to 1 for any non-zero value, 0 otherwise. */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = (do_fsync != 0);
}

208
/* Try to store the delta so we can try to resolve it later */
209
static int store_delta(git_indexer *idx)
210
{
211 212
	struct delta_info *delta;

213
	delta = git__calloc(1, sizeof(struct delta_info));
214
	GIT_ERROR_CHECK_ALLOC(delta);
215
	delta->delta_off = idx->entry_start;
216

217
	if (git_vector_insert(&idx->deltas, delta) < 0)
218 219 220 221 222
		return -1;

	return 0;
}

223
/*
 * Format the object header for an object of `len` bytes and `type`, and
 * feed it to `ctx`.
 *
 * Returns the formatting error if negative, otherwise the result of
 * git_hash_update().
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char hdr[64];
	size_t hdr_len;
	int error;

	error = git_odb__format_object_header(&hdr_len, hdr, sizeof(hdr), (size_t)len, type);
	if (error < 0)
		return error;

	return git_hash_update(ctx, hdr, hdr_len);
}

236
/*
 * Read `stream` until it reports 0 bytes, feeding every chunk to
 * idx->hash_ctx.  When idx->do_verify is set the chunks are also appended
 * to idx->entry_data.
 *
 * Returns 0 once the stream reports 0 bytes, or the first negative value
 * returned by the stream read, the buffer append or the hash update.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t read;
	int error;

	assert(idx && stream);

	do {
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
		if (read < 0)
			return (int)read;

		/* a failed append would leave entry_data incomplete for verification */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, read)) < 0)
			return error;

		/* a failed update would produce a wrong object id */
		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, read)) < 0)
			return error;
	} while (read > 0);

	return 0;
}

258
/* In order to create the packfile stream, we need to skip over the delta base description */
259
static int advance_delta_offset(git_indexer *idx, git_object_t type)
260 261 262
{
	git_mwindow *w = NULL;

263
	assert(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
264

265
	if (type == GIT_OBJECT_REF_DELTA) {
266 267
		idx->off += GIT_OID_RAWSZ;
	} else {
268 269
		off64_t base_off;
		int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
270
		git_mwindow_close(&w);
271 272
		if (error < 0)
			return error;
273 274 275 276 277 278
	}

	return 0;
}

/* Read from the stream and discard any output */
279
static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
280 281 282 283 284 285
{
	ssize_t read;

	assert(stream);

	do {
286
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
287 288 289 290 291 292 293 294
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

295
/*
 * Compute the CRC32 of the `size` bytes of `mwf` that start at `start`
 * and store it, converted with htonl(), in `*crc_out`.
 *
 * The range is walked one mapped window at a time.  Returns 0 on success
 * and -1 if a window cannot be opened; `*crc_out` is written only on
 * success.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	void *ptr;
	uint32_t crc;
	unsigned int left, len;
	git_mwindow *w = NULL;

	crc = crc32(0L, Z_NULL, 0);
	while (size) {
		ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left);
		if (ptr == NULL)
			return -1;

		/*
		 * Use what the window offers unless fewer bytes remain.  The
		 * comparison is done in 64 bits: narrowing `size` to 32 bits
		 * first turns a remainder that is a multiple of 4GiB into 0,
		 * and the loop would then never make progress.
		 */
		len = left;
		if (size > 0 && size < (off64_t)left)
			len = (unsigned int)size;

		crc = crc32(crc, ptr, len);
		size -= len;
		start += len;
		git_mwindow_close(&w);
	}

	*crc_out = htonl(crc);
	return 0;
}

319
/*
 * Record `oid` in idx->expected_oids unless it is already known.
 *
 * Returns 0 when nothing had to be added or the id was added, and a
 * negative value when the copy cannot be allocated or inserted.
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	git_oid *dup;
	int error;

	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if ((idx->odb && git_odb_exists(idx->odb, oid)) ||
	    git_oidmap_exists(idx->pack->idx_cache, oid) ||
	    git_oidmap_exists(idx->expected_oids, oid))
		return 0;

	dup = git__malloc(sizeof(*oid));
	GIT_ERROR_CHECK_ALLOC(dup);
	git_oid_cpy(dup, oid);

	/* the copy serves as both key and value; free it if the map rejects it */
	if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
		git__free(dup);

	return error;
}

/*
 * Parse `obj` and update idx->expected_oids accordingly:
 *
 *  - the parsed object's own id is removed from the map if present;
 *  - unless the object is already in idx->odb, the ids it refers to
 *    (tree entries; commit parents and tree; tag target) are added
 *    through add_expected_oid().
 *
 * Objects that are not blobs, trees, commits or tags are ignored.
 * Returns 0 on success, or the negative error from parsing the object or
 * from add_expected_oid().
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	git_object *object = NULL;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		goto out;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid)) {
		/* leave through `out` so that the parsed object is released */
		error = 0;
		goto out;
	}

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry) {
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;
			}

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid) {
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;
			}

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}

413
/*
 * Finish the object that was just streamed: finalise its id from
 * idx->hash_ctx, optionally run the connectivity check, register a pack
 * entry in idx->pack->idx_cache, compute the entry CRC and append an
 * index record to idx->objects, updating the fanout table.
 *
 * Returns 0 on success and -1 on any failure.  Records that were not
 * handed over to the cache or the vector are freed before returning.
 */
static int store_object(git_indexer *idx)
{
	int i, error;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	if (git_hash_final(&oid, &idx->hash_ctx) < 0)
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
		    idx->entry_data.ptr,
		    idx->entry_data.size,
		    idx->entry_type
		};

		if ((error = check_object_connectivity(idx, &rawobj)) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if ((error = git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry)) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* the cache holds pentry from here on; on_error must not free it */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}

484 485
/* Report whether `id` is present in the pack's idx_cache. */
GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
{
	git_oidmap *cache = idx->pack->idx_cache;

	return git_oidmap_exists(cache, id);
}

489
/*
 * Register an object that starts at `entry_start`: `pentry` goes into
 * idx->pack->idx_cache (keyed by pentry->sha1), `entry` is appended to
 * idx->objects and the fanout table is updated.
 *
 * Returns 0 on success.  On failure returns -1 and neither record is
 * left referenced by the indexer, so the caller may free both.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	int i;

	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1) ||
	    git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0) {
		/*
		 * Take pentry back out of the cache: callers free it when we
		 * fail, and the cache must not keep a pointer to freed memory.
		 */
		git_oidmap_delete(idx->pack->idx_cache, &pentry->sha1);
		return -1;
	}

	for (i = entry->oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;
}

519
/*
 * Hash the unpacked object `obj`, compute the CRC of its pack entry
 * (from `entry_start` up to idx->off) and register it via save_entry().
 *
 * Returns 0 on success; `obj->data` then still belongs to the caller.
 * On every failure returns -1 after freeing `obj->data` (which is reset
 * to NULL) together with any record that was not handed over to the
 * indexer.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	if (save_entry(idx, entry, pentry, entry_start) < 0) {
		/*
		 * If the cache still maps this id to our pentry it is the
		 * cache's to keep; only free what is not referenced there.
		 */
		if (git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
			pentry = NULL;
		goto on_error;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	obj->data = NULL;
	return -1;
}
553

554
/*
 * Invoke the progress callback, if one was configured, with `stats` and
 * the stored payload.  Returns 0 when there is no callback; otherwise the
 * callback result as processed by git_error_set_after_callback_function().
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}

563
/* Hash everything but the last 20B of input */
564
static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
565
{
566
	size_t to_expell, to_keep;
567 568 569 570 571

	if (size == 0)
		return;

	/* Easy case, dump the buffer and the data minus the last 20 bytes */
572
	if (size >= GIT_OID_RAWSZ) {
573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
		git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
		git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);

		data += size - GIT_OID_RAWSZ;
		memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
		idx->inbuf_len = GIT_OID_RAWSZ;
		return;
	}

	/* We can just append */
	if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
		memcpy(idx->inbuf + idx->inbuf_len, data, size);
		idx->inbuf_len += size;
		return;
	}

	/* We need to partially drain the buffer and then append */
590 591
	to_keep   = GIT_OID_RAWSZ - size;
	to_expell = idx->inbuf_len - to_keep;
592 593 594 595 596 597 598 599

	git_hash_update(&idx->trailer, idx->inbuf, to_expell);

	memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
	memcpy(idx->inbuf + to_keep, data, size);
	idx->inbuf_len += size - to_expell;
}

600
/*
 * Copy `size` bytes from `data` into the pack file at `offset` through a
 * temporary writable shared mapping.
 *
 * Returns 0 on success, or the error from git__mmap_alignment() or
 * p_mmap().
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
	git_file fd = idx->pack->mwf.fd;
	size_t alignment;
	size_t in_page;
	off64_t map_start;
	git_map map;
	int error;

	assert(data && size);

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	in_page = offset % alignment;
	map_start = offset - in_page;

	error = p_mmap(&map, in_page + size, GIT_PROT_WRITE, GIT_MAP_SHARED, fd, map_start);
	if (error < 0)
		return error;

	memcpy((unsigned char *)map.data + in_page, data, size);
	p_munmap(&map);

	return 0;
}

/*
 * Write `size` bytes from `data` at the current end of the pack (as
 * tracked by idx->pack->mwf.size).  The tracked size itself is not
 * changed here.
 *
 * Returns 0 when `size` is 0 or the data was written, the error from
 * git__mmap_alignment() or write_at(), or -1 if the file cannot be
 * extended.
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	off64_t end_after;
	size_t alignment;
	size_t in_page;
	off64_t last_page;
	off64_t pack_size = idx->pack->mwf.size;
	int fd = idx->pack->mwf.fd;
	int error;

	if (size == 0)
		return 0;

	error = git__mmap_alignment(&alignment);
	if (error < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	end_after = pack_size + size;
	in_page = end_after % alignment;
	last_page = end_after - in_page;

	if (p_lseek(fd, last_page + alignment - 1, SEEK_SET) < 0 ||
	    p_write(fd, data, 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, pack_size, size);
}

662
/*
 * Read one object from the pack at idx->off, resuming a partially read
 * object when idx->have_stream is set.
 *
 * A new object has its header unpacked and a packfile stream opened on
 * its data.  Delta contents are read and discarded, then the delta is
 * recorded with store_delta().  Other contents are hashed, then the
 * object is recorded with store_object().  Counters in `stats` are
 * updated and the progress callback is invoked.
 *
 * Returns 0 on success, GIT_EBUFS when more input is needed, or another
 * error code.
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	const off64_t start = idx->off;
	size_t obj_size;
	git_object_t type;
	git_mwindow *w = NULL;
	int error;

	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	if (!idx->have_stream) {
		error = git_packfile_unpack_header(&obj_size, &type, &idx->pack->mwf, &w, &idx->off);
		/* rewind so the header is parsed again once more data is in */
		if (error == GIT_EBUFS)
			idx->off = start;
		if (error == GIT_EBUFS || error < 0)
			return error;

		git_mwindow_close(&w);
		idx->entry_start = start;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type != GIT_OBJECT_REF_DELTA && type != GIT_OBJECT_OFS_DELTA) {
			idx->have_delta = 0;

			error = hash_header(&idx->hash_ctx, obj_size, type);
			if (error < 0)
				return error;
		} else {
			error = advance_delta_offset(idx, type);
			if (error == GIT_EBUFS)
				idx->off = start;
			if (error == GIT_EBUFS || error < 0)
				return error;

			idx->have_delta = 1;
		}

		idx->have_stream = 1;
		idx->entry_type = type;

		error = git_packfile_stream_open(stream, idx->pack, idx->off);
		if (error < 0)
			return error;
	}

	error = idx->have_delta ?
		read_object_stream(idx, stream) :
		hash_object_stream(idx, stream);

	idx->off = stream->curpos;
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	error = idx->have_delta ? store_delta(idx) : store_object(idx);
	if (error < 0)
		return error;

	if (!idx->have_delta)
		stats->indexed_objects++;
	stats->received_objects++;

	return do_progress_callback(idx, stats);
}

751
/*
 * Append `size` bytes of pack data to the indexer and index as many
 * objects as the data received so far allows.
 *
 * The data is written to the pack file and fed to the trailer hash.
 * Once enough bytes for the pack header have arrived, the header is
 * parsed, the object containers are set up and `stats` is reset.
 * Objects are then read until all announced objects are indexed or more
 * input is needed.
 *
 * Returns 0 on success (including "need more data") and a negative
 * error code otherwise.
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error = -1;
	struct git_pack_header *hdr = &idx->hdr;
	git_mwindow_file *mwf = &idx->pack->mwf;

	assert(idx && data && stats);

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* pass the size through untouched; narrowing it to int would truncate */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		/* compare at full width so large pack sizes are not truncated */
		if (idx->pack->mwf.size < (off64_t)sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	git_mwindow_free_all(mwf);

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
828

829
/*
 * Rewrite `path` in place: everything after the last '/' is replaced by
 * "pack-", the hex form of idx->hash and `suffix`.  If `path` contains
 * no '/', the whole buffer is replaced.
 *
 * Returns 0 on success and -1 if the buffer cannot be grown or ends up
 * in the out-of-memory state.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len, needed;

	/* search backwards for '/' */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	needed = dir_len + 1 + strlen(prefix) + GIT_OID_HEXSZ + strlen(suffix) + 1;
	if (git_buf_grow(path, needed) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* the hex id is written straight into the space reserved above */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	if (git_buf_oom(path))
		return -1;

	return 0;
}

851 852 853 854
/**
 * Rewind the packfile by the trailer, as we might need to fix the
 * packfile by injecting objects at the tail and must overwrite it.
 */
855
static void seek_back_trailer(git_indexer *idx)
856 857 858 859 860
{
	idx->pack->mwf.size -= GIT_OID_RAWSZ;
	git_mwindow_free_all(&idx->pack->mwf);
}

861
/*
 * Append the object `id`, read from idx->odb, to the end of the pack so
 * that a thin pack becomes self-contained.
 *
 * The existing trailer is overwritten: the object header and the
 * deflated object data are appended, followed by a placeholder trailer,
 * and the object is registered through save_entry().
 *
 * Returns 0 on success and a negative value on failure.  Every failure
 * path after the object was read releases the odb object, the deflate
 * buffer and any record that was not handed over to the indexer.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj = NULL;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	seek_back_trailer(idx);
	entry_start = idx->pack->mwf.size;

	if (git_odb_read(&obj, idx->odb, id) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		return -1;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	hdr_len = git_packfile__object_header(hdr, len, git_odb_object_type(obj));
	if ((error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));
	git_buf_dispose(&buf);

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

	/*
	 * If the cache still maps this id to our pentry, the cache keeps it;
	 * freeing it below would leave a dangling pointer behind.
	 */
	if (error && git_oidmap_get(idx->pack->idx_cache, &pentry->sha1) == pentry)
		pentry = NULL;

cleanup:
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	/* also covers the paths that bail out with the deflate buffer filled */
	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}

935
/*
 * Try to make progress on an unresolved thin pack: find the first
 * pending REF delta, read the id of its base from the pack and, unless
 * that base is already in the pack's cache, inject it from the ODB.
 *
 * Returns 0 when the base was already present or was injected (in which
 * case stats->local_objects is incremented).  Returns a negative value
 * when there is no ODB, no REF delta is pending, the pack cannot be read
 * or the injection fails.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow_file *mwf = &idx->pack->mwf;
	struct delta_info *delta;
	git_mwindow *w = NULL;
	unsigned char *base_info;
	unsigned int i, left = 0;
	off64_t curpos = 0;
	git_object_t type;
	git_oid base;
	size_t size;
	int error, found_ref_delta = 0;

	assert(git_vector_length(&idx->deltas) > 0);

	if (idx->odb == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, i, delta) {
		if (delta == NULL)
			continue;

		curpos = delta->delta_off;
		error = git_packfile_unpack_header(&size, &type, mwf, &w, &curpos);
		if (error < 0)
			return error;

		found_ref_delta = (type == GIT_OBJECT_REF_DELTA);
		if (found_ref_delta)
			break;
	}

	if (!found_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* curpos now points to the base information, which is an OID */
	base_info = git_mwindow_open(mwf, &w, curpos, GIT_OID_RAWSZ, &left);
	if (base_info == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base, base_info);
	git_mwindow_close(&w);

	if (!has_entry(idx, &base)) {
		if (inject_object(idx, &base) < 0)
			return -1;

		stats->local_objects++;
	}

	return 0;
}

997
/*
 * Resolve the deltas recorded in idx->deltas.
 *
 * The list is walked repeatedly.  Each delta that can be unpacked is
 * optionally checked for connectivity, hashed and saved; its slot in the
 * list is then set to NULL and the record freed.  When a full pass makes
 * no progress, fix_thin_pack() is asked to inject a missing base.
 *
 * Returns 0 once no pending deltas remain, -1 on unpack or thin-pack
 * failures, or the negative result of the progress callback.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* the unpacked data is still ours here; release it before moving on */
				git__free(obj.data);
				continue;
			}

			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}

1054
/*
 * Rewrite the pack header with the object count increased by the number
 * of injected objects, then recompute the trailer hash over the pack.
 *
 * Returns 0 on success and -1 if the header cannot be written or a
 * window on the pack cannot be opened.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	const size_t chunk = 1024*1024;
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *w = NULL;
	unsigned int left;
	off64_t hashed;
	void *ptr;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * We now use the same technique as before to determine the
	 * hash. We keep reading up to the end and let
	 * hash_partially() keep the existing trailer out of the
	 * calculation.
	 */
	git_mwindow_free_all(mwf);
	idx->inbuf_len = 0;

	for (hashed = 0; hashed < mwf->size; hashed += left) {
		ptr = git_mwindow_open(mwf, &w, hashed, chunk, &left);
		if (ptr == NULL)
			return -1;

		hash_partially(idx, ptr, left);
		git_mwindow_close(&w);
	}

	return 0;
}

1095
/*
 * Finalize the pack that has been fed to the indexer.
 *
 * Verifies that the pack ends with exactly one trailer and that the
 * trailer matches the hash computed while receiving the data, resolves
 * the remaining deltas, rewrites header and trailer if local objects
 * were injected, writes the version 2 index file and finally moves the
 * index and the packfile to their final names.
 *
 * Returns 0 on success. On failure a negative value is returned: the
 * error from resolve_deltas() is passed through unchanged, every other
 * failure yields -1.
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	/* Map the last GIT_OID_RAWSZ bytes of the pack: the trailer as sent to us */
	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	git_hash_final(&trailer_hash, &idx->trailer);
	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		git_hash_final(&trailer_hash, &idx->trailer);

		/*
		 * The pack on disk must end with the recomputed trailer; do
		 * not go on to commit the pack if it could not be written.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name: swap the "pack" suffix for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		return -1;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/*
	 * NOTE(review): the results of most git_filebuf_write() calls below
	 * are not checked; presumably a failed write is reported by the
	 * later git_filebuf_hash()/git_filebuf_commit_at() calls - confirm.
	 */

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/*
	 * Write out the offsets. An entry whose offset is UINT32_MAX keeps
	 * its real offset in offset_long; for those the high bit is set and
	 * the remaining bits hold the position in the long offset table.
	 */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets, as two 32-bit halves, high half first */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid));

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	git_mwindow_free_all(&idx->pack->mwf);

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* go through the cleanup path so that `filename` is released */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}

1296
/*
 * Release an indexer together with the resources it holds: the packfile
 * stream (if one is active), the object and delta vectors, the pack's
 * index cache, the pack itself, the expected-OID map and both hash
 * contexts. Passing NULL is a no-op.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *oid;
	git_oid *expected;
	size_t pos;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* The cache entries are individually allocated; free them, then the map */
	if (idx->pack->idx_cache) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/* The pack is only closed and freed while the mwindow mutex is held */
	if (git_mutex_lock(&git__mwindow_mutex) == 0) {
		/* A pack that was never committed is still open; close it first */
		if (!idx->pack_committed)
			git_packfile_close(idx->pack, true);

		git_packfile_free(idx->pack);
		git_mutex_unlock(&git__mwindow_mutex);
	}

	/* Free every value stored in the expected-OID map; the map itself goes below */
	for (pos = 0;
	     git_oidmap_iterate((void **) &expected, idx->expected_oids, &pos, &oid) == 0;
	     )
		git__free(expected);

	git_hash_ctx_cleanup(&idx->trailer);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_buf_dispose(&idx->entry_data);
	git_oidmap_free(idx->expected_oids);
	git__free(idx);
}