indexer.c 31.3 KB
Newer Older
1
/*
Edward Thomson committed
2
 * Copyright (C) the libgit2 contributors. All rights reserved.
3
 *
Vicent Marti committed
4 5
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
6 7
 */

8 9
#include "indexer.h"

Carlos Martín Nieto committed
10
#include "git2/indexer.h"
11
#include "git2/object.h"
Carlos Martín Nieto committed
12

13 14 15
#include "commit.h"
#include "tree.h"
#include "tag.h"
16
#include "pack.h"
Carlos Martín Nieto committed
17
#include "mwindow.h"
18
#include "posix.h"
19 20
#include "pack.h"
#include "filebuf.h"
21
#include "oid.h"
22
#include "oidarray.h"
23
#include "oidmap.h"
24
#include "zstream.h"
25
#include "object.h"
26

27 28
extern git_mutex git__mwindow_mutex;

29
size_t git_indexer__max_objects = UINT32_MAX;
30

31
#define UINT31_MAX (0x7FFFFFFF)
32

33
struct entry {
34
	git_oid oid;
35 36 37 38 39
	uint32_t crc;
	uint32_t offset;
	uint64_t offset_long;
};

40
struct git_indexer {
41
	unsigned int parsed_header :1,
42
		pack_committed :1,
43
		have_stream :1,
44
		have_delta :1,
45 46
		do_fsync :1,
		do_verify :1;
47
	struct git_pack_header hdr;
48
	struct git_pack_file *pack;
49
	unsigned int mode;
50 51
	off64_t off;
	off64_t entry_start;
52
	git_object_t entry_type;
53
	git_buf entry_data;
54
	git_packfile_stream stream;
55 56 57 58
	size_t nr_objects;
	git_vector objects;
	git_vector deltas;
	unsigned int fanout[256];
59
	git_hash_ctx hash_ctx;
60
	git_oid hash;
61
	git_indexer_progress_cb progress_cb;
62
	void *progress_payload;
63
	char objbuf[8*1024];
64

65 66 67
	/* OIDs referenced from pack objects. Used for verification. */
	git_oidmap *expected_oids;

68 69 70
	/* Needed to look up objects which we want to inject to fix a thin pack */
	git_odb *odb;

71 72
	/* Fields for calculating the packfile trailer (hash of everything before it) */
	char inbuf[GIT_OID_RAWSZ];
73
	size_t inbuf_len;
74
	git_hash_ctx trailer;
75 76 77
};

struct delta_info {
78
	off64_t delta_off;
79 80
};

81
const git_oid *git_indexer_hash(const git_indexer *idx)
82 83 84 85
{
	return &idx->hash;
}

86
static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
87 88
{
	int error;
89
	git_map map;
90

91
	if ((error = p_mmap(&map, sizeof(*hdr), GIT_PROT_READ, GIT_MAP_SHARED, pack->mwf.fd, 0)) < 0)
92
		return error;
93

94 95 96 97
	memcpy(hdr, map.data, sizeof(*hdr));
	p_munmap(&map);

	/* Verify we recognize this pack file format. */
98
	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
99
		git_error_set(GIT_ERROR_INDEXER, "wrong pack signature");
100 101
		return -1;
	}
102

103
	if (!pack_version_ok(hdr->hdr_version)) {
104
		git_error_set(GIT_ERROR_INDEXER, "wrong pack version");
105 106
		return -1;
	}
Carlos Martín Nieto committed
107

108
	return 0;
109 110
}

111
static int objects_cmp(const void *a, const void *b)
112 113 114 115
{
	const struct entry *entrya = a;
	const struct entry *entryb = b;

116
	return git_oid__cmp(&entrya->oid, &entryb->oid);
117 118
}

119
/*
 * Fill `opts` from the GIT_INDEXER_OPTIONS_INIT template for the requested
 * structure version.
 *
 * NOTE(review): the template macro presumably rejects unsupported versions
 * by returning early; confirm against its definition.
 */
int git_indexer_options_init(git_indexer_options *opts, unsigned int version)
{
	GIT_INIT_STRUCTURE_FROM_TEMPLATE(
		opts,
		version,
		git_indexer_options,
		GIT_INDEXER_OPTIONS_INIT);

	return 0;
}

126
#ifndef GIT_DEPRECATE_HARD
127 128 129 130
int git_indexer_init_options(git_indexer_options *opts, unsigned int version)
{
	return git_indexer_options_init(opts, version);
}
131
#endif
132

133 134
int git_indexer_new(
		git_indexer **out,
135
		const char *prefix,
136
		unsigned int mode,
137
		git_odb *odb,
138
		git_indexer_options *in_opts)
139
{
140
	git_indexer_options opts = GIT_INDEXER_OPTIONS_INIT;
141
	git_indexer *idx;
142
	git_buf path = GIT_BUF_INIT, tmp_path = GIT_BUF_INIT;
143
	static const char suff[] = "/pack";
144
	int error, fd = -1;
145

146 147 148
	if (in_opts)
		memcpy(&opts, in_opts, sizeof(opts));

149
	idx = git__calloc(1, sizeof(git_indexer));
150
	GIT_ERROR_CHECK_ALLOC(idx);
151
	idx->odb = odb;
152 153
	idx->progress_cb = opts.progress_cb;
	idx->progress_payload = opts.progress_cb_payload;
154
	idx->mode = mode ? mode : GIT_PACK_FILE_MODE;
155
	git_buf_init(&idx->entry_data, 0);
156

157 158 159
	if ((error = git_hash_ctx_init(&idx->hash_ctx)) < 0 ||
	    (error = git_hash_ctx_init(&idx->trailer)) < 0 ||
	    (error = git_oidmap_new(&idx->expected_oids)) < 0)
160
		goto cleanup;
161

162
	idx->do_verify = opts.verify;
163

164
	if (git_repository__fsync_gitdir)
165 166
		idx->do_fsync = 1;

167 168 169 170
	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

171
	fd = git_futils_mktmp(&tmp_path, git_buf_cstr(&path), idx->mode);
172
	git_buf_dispose(&path);
173 174 175 176
	if (fd < 0)
		goto cleanup;

	error = git_packfile_alloc(&idx->pack, git_buf_cstr(&tmp_path));
177
	git_buf_dispose(&tmp_path);
178

179 180 181
	if (error < 0)
		goto cleanup;

182 183 184 185
	idx->pack->mwf.fd = fd;
	if ((error = git_mwindow_file_register(&idx->pack->mwf)) < 0)
		goto cleanup;

186 187 188 189
	*out = idx;
	return 0;

cleanup:
190 191 192
	if (fd != -1)
		p_close(fd);

lhchavez committed
193 194
	if (git_buf_len(&tmp_path) > 0)
		p_unlink(git_buf_cstr(&tmp_path));
195 196

	if (idx->pack != NULL)
lhchavez committed
197
		p_unlink(idx->pack->pack_name);
198

199 200
	git_buf_dispose(&path);
	git_buf_dispose(&tmp_path);
201 202 203 204
	git__free(idx);
	return -1;
}

205 206 207 208 209
/* Set the indexer's fsync flag; any non-zero `do_fsync` enables it. */
void git_indexer__set_fsync(git_indexer *idx, int do_fsync)
{
	idx->do_fsync = do_fsync ? 1 : 0;
}

210
/* Try to store the delta so we can try to resolve it later */
211
static int store_delta(git_indexer *idx)
212
{
213 214
	struct delta_info *delta;

215
	delta = git__calloc(1, sizeof(struct delta_info));
216
	GIT_ERROR_CHECK_ALLOC(delta);
217
	delta->delta_off = idx->entry_start;
218

219
	if (git_vector_insert(&idx->deltas, delta) < 0)
220 221 222 223 224
		return -1;

	return 0;
}

225
/*
 * Format the object header for an object of the given length and type and
 * feed it to `ctx`. Returns the first error reported by the formatter or by
 * the hash update.
 */
static int hash_header(git_hash_ctx *ctx, off64_t len, git_object_t type)
{
	char header[64];
	size_t header_len;
	int error = git_odb__format_object_header(
		&header_len, header, sizeof(header), (size_t)len, type);

	if (error < 0)
		return error;

	return git_hash_update(ctx, header, header_len);
}

238
/*
 * Read the current object from `stream`, feeding every chunk to the object
 * hash and, when verification is enabled, also appending it to entry_data.
 *
 * Returns 0 once the stream is exhausted, the negative value reported by the
 * stream read (callers treat GIT_EBUFS specially), or the error from
 * buffering or hashing a chunk.
 */
static int hash_object_stream(git_indexer*idx, git_packfile_stream *stream)
{
	ssize_t read;
	int error;

	assert(idx && stream);

	while ((read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) > 0) {
		/* a failed append would leave truncated data for verification */
		if (idx->do_verify &&
		    (error = git_buf_put(&idx->entry_data, idx->objbuf, read)) < 0)
			return error;

		if ((error = git_hash_update(&idx->hash_ctx, idx->objbuf, read)) < 0)
			return error;
	}

	return (read < 0) ? (int)read : 0;
}

260
/* In order to create the packfile stream, we need to skip over the delta base description */
261
static int advance_delta_offset(git_indexer *idx, git_object_t type)
262 263 264
{
	git_mwindow *w = NULL;

265
	assert(type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA);
266

267
	if (type == GIT_OBJECT_REF_DELTA) {
268 269
		idx->off += GIT_OID_RAWSZ;
	} else {
270 271
		off64_t base_off;
		int error = get_delta_base(&base_off, idx->pack, &w, &idx->off, type, idx->entry_start);
272
		git_mwindow_close(&w);
273 274
		if (error < 0)
			return error;
275 276 277 278 279 280
	}

	return 0;
}

/* Read from the stream and discard any output */
281
static int read_object_stream(git_indexer *idx, git_packfile_stream *stream)
282 283 284 285 286 287
{
	ssize_t read;

	assert(stream);

	do {
288
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
289 290 291 292 293 294 295 296
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

297
/*
 * Compute the CRC-32 of `size` bytes of the pack starting at `start`,
 * walking the file window by window. The result is stored through htonl()
 * in `crc_out`. Returns -1 when a window cannot be opened.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, off64_t start, off64_t size)
{
	git_mwindow *window = NULL;
	uint32_t running = crc32(0L, Z_NULL, 0);
	unsigned int avail, chunk;
	void *mapped;

	for (; size; size -= chunk, start += chunk) {
		mapped = git_mwindow_open(mwf, &window, start, (size_t)size, &avail);
		if (mapped == NULL)
			return -1;

		/* never read past the end of the entry, whatever the window holds */
		chunk = min(avail, (unsigned int)size);
		running = crc32(running, mapped, chunk);
		git_mwindow_close(&window);
	}

	*crc_out = htonl(running);
	return 0;
}

321
/*
 * Record `oid` as an object that is still expected to show up.
 *
 * Nothing is recorded when the object is already in the ODB, has already
 * been indexed from this pack, or is already in the expected set. Returns 0
 * on success or a negative value on allocation/insertion failure.
 */
static int add_expected_oid(git_indexer *idx, const git_oid *oid)
{
	git_oid *dup;
	int error;

	/*
	 * If we know about that object because it is stored in our ODB or
	 * because we have already processed it as part of our pack file, we do
	 * not have to expect it.
	 */
	if (idx->odb && git_odb_exists(idx->odb, oid))
		return 0;

	if (git_oidmap_exists(idx->pack->idx_cache, oid) ||
	    git_oidmap_exists(idx->expected_oids, oid))
		return 0;

	dup = git__malloc(sizeof(*oid));
	GIT_ERROR_CHECK_ALLOC(dup);
	git_oid_cpy(dup, oid);

	/* the map only owns the copy once the insertion succeeded */
	if ((error = git_oidmap_set(idx->expected_oids, dup, dup)) < 0)
		git__free(dup);

	return error;
}

/*
 * Parse `obj` and update the set of expected object ids: the object itself
 * is no longer expected, and, unless it is already present in the ODB, every
 * object it references (tree entries, commit parents and tree, tag target)
 * is added through add_expected_oid().
 *
 * Objects that are not blobs, trees, commits or tags are ignored.
 * Returns 0 on success or a negative value when parsing or recording fails.
 */
static int check_object_connectivity(git_indexer *idx, const git_rawobj *obj)
{
	git_object *object = NULL;
	git_oid *expected;
	int error;

	if (obj->type != GIT_OBJECT_BLOB &&
	    obj->type != GIT_OBJECT_TREE &&
	    obj->type != GIT_OBJECT_COMMIT &&
	    obj->type != GIT_OBJECT_TAG)
		return 0;

	if ((error = git_object__from_raw(&object, obj->data, obj->len, obj->type)) < 0)
		goto out;

	if ((expected = git_oidmap_get(idx->expected_oids, &object->cached.oid)) != NULL) {
		git_oidmap_delete(idx->expected_oids, &object->cached.oid);
		git__free(expected);
	}

	/*
	 * Check whether this is a known object. If so, we can just continue as
	 * we assume that the ODB has a complete graph. `error` is 0 here; go
	 * through `out` so the parsed object is released.
	 */
	if (idx->odb && git_odb_exists(idx->odb, &object->cached.oid))
		goto out;

	switch (obj->type) {
		case GIT_OBJECT_TREE:
		{
			git_tree *tree = (git_tree *) object;
			git_tree_entry *entry;
			size_t i;

			git_array_foreach(tree->entries, i, entry)
				if ((error = add_expected_oid(idx, entry->oid)) < 0)
					goto out;

			break;
		}
		case GIT_OBJECT_COMMIT:
		{
			git_commit *commit = (git_commit *) object;
			git_oid *parent_oid;
			size_t i;

			git_array_foreach(commit->parent_ids, i, parent_oid)
				if ((error = add_expected_oid(idx, parent_oid)) < 0)
					goto out;

			if ((error = add_expected_oid(idx, &commit->tree_id)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_TAG:
		{
			git_tag *tag = (git_tag *) object;

			if ((error = add_expected_oid(idx, &tag->target)) < 0)
				goto out;

			break;
		}
		case GIT_OBJECT_BLOB:
		default:
			break;
	}

out:
	git_object_free(object);

	return error;
}

415
/*
 * Finish the non-delta object that was just read: finalize its hash, run the
 * optional connectivity check, register it in the pack's id cache, compute
 * its CRC and append it to the objects vector and fanout table.
 *
 * Returns 0 on success and -1 on any failure (including a duplicate object).
 */
static int store_object(git_indexer *idx)
{
	int i;
	git_oid oid;
	struct entry *entry;
	off64_t entry_size;
	struct git_pack_entry *pentry = NULL;
	off64_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	if (git_hash_final(&oid, &idx->hash_ctx) < 0)
		goto on_error;

	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	if (idx->do_verify) {
		git_rawobj rawobj = {
		    idx->entry_data.ptr,
		    idx->entry_data.size,
		    idx->entry_type
		};

		if (check_object_connectivity(idx, &rawobj) < 0)
			goto on_error;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1)) {
		git_error_set(GIT_ERROR_INDEXER, "duplicate object %s found in pack", git_oid_tostr_s(&pentry->sha1));
		goto on_error;
	}

	if (git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set_oom();
		goto on_error;
	}

	/* The cache holds pentry from here on; it must not be freed below. */
	pentry = NULL;

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(pentry);
	git__free(entry);

	return -1;
}

486 487
/* Tell whether an object with the given id is already in the pack's id cache. */
GIT_INLINE(bool) has_entry(git_indexer *idx, git_oid *id)
{
	git_oidmap *known = idx->pack->idx_cache;

	return git_oidmap_exists(known, id);
}

491
/*
 * Register an already hashed object: fill in the offsets, add `pentry` to
 * the pack's id cache, append `entry` to the objects vector and bump the
 * fanout table.
 *
 * Returns 0 on success. On failure (-1) neither `entry` nor `pentry` is
 * referenced by the indexer any more, so the caller remains their owner.
 */
static int save_entry(git_indexer *idx, struct entry *entry, struct git_pack_entry *pentry, off64_t entry_start)
{
	int i;

	if (entry_start > UINT31_MAX) {
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	pentry->offset = entry_start;

	if (git_oidmap_exists(idx->pack->idx_cache, &pentry->sha1) ||
	    git_oidmap_set(idx->pack->idx_cache, &pentry->sha1, pentry) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "cannot insert object into pack");
		return -1;
	}

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0) {
		/*
		 * Take pentry back out of the cache: callers free it on
		 * failure and the cache must not keep a dangling pointer.
		 */
		git_oidmap_delete(idx->pack->idx_cache, &pentry->sha1);
		return -1;
	}

	for (i = entry->oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;
}

521
/*
 * Hash a fully resolved object, compute the CRC of its packed entry (from
 * `entry_start` up to idx->off) and register it through save_entry().
 *
 * Returns 0 on success and -1 on failure. On the failures handled here
 * (hashing, allocation, CRC) obj->data is freed as well.
 *
 * NOTE(review): when save_entry() itself fails, `entry`, `pentry` and
 * obj->data are not released here.
 */
static int hash_and_save(git_indexer *idx, git_rawobj *obj, off64_t entry_start)
{
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry = NULL;

	entry = git__calloc(1, sizeof(*entry));
	GIT_ERROR_CHECK_ALLOC(entry);

	if (git_odb__hashobj(&oid, obj) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "failed to hash object");
		goto on_error;
	}

	/* go through on_error so that `entry` is released as well */
	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	git_oid_cpy(&entry->oid, &oid);
	entry->crc = crc32(0L, Z_NULL, 0);

	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	return save_entry(idx, entry, pentry, entry_start);

on_error:
	git__free(pentry);
	git__free(entry);
	git__free(obj->data);
	return -1;
}
555

556
/*
 * Invoke the user's progress callback, if one is set, and pass its result
 * through git_error_set_after_callback_function(). Returns 0 when no
 * callback is installed.
 */
static int do_progress_callback(git_indexer *idx, git_indexer_progress *stats)
{
	if (!idx->progress_cb)
		return 0;

	return git_error_set_after_callback_function(
		idx->progress_cb(stats, idx->progress_payload),
		"indexer progress");
}

565
/* Hash everything but the last 20B of input */
566
static void hash_partially(git_indexer *idx, const uint8_t *data, size_t size)
567
{
568
	size_t to_expell, to_keep;
569 570 571 572 573

	if (size == 0)
		return;

	/* Easy case, dump the buffer and the data minus the last 20 bytes */
574
	if (size >= GIT_OID_RAWSZ) {
575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591
		git_hash_update(&idx->trailer, idx->inbuf, idx->inbuf_len);
		git_hash_update(&idx->trailer, data, size - GIT_OID_RAWSZ);

		data += size - GIT_OID_RAWSZ;
		memcpy(idx->inbuf, data, GIT_OID_RAWSZ);
		idx->inbuf_len = GIT_OID_RAWSZ;
		return;
	}

	/* We can just append */
	if (idx->inbuf_len + size <= GIT_OID_RAWSZ) {
		memcpy(idx->inbuf + idx->inbuf_len, data, size);
		idx->inbuf_len += size;
		return;
	}

	/* We need to partially drain the buffer and then append */
592 593
	to_keep   = GIT_OID_RAWSZ - size;
	to_expell = idx->inbuf_len - to_keep;
594 595 596 597 598 599 600 601

	git_hash_update(&idx->trailer, idx->inbuf, to_expell);

	memmove(idx->inbuf, idx->inbuf + to_expell, to_keep);
	memcpy(idx->inbuf + to_keep, data, size);
	idx->inbuf_len += size - to_expell;
}

602
/*
 * Copy `size` bytes of `data` into the pack file at `offset` through a
 * writable shared mapping. Returns 0 on success or the error from querying
 * the mmap alignment or from mapping the file.
 */
static int write_at(git_indexer *idx, const void *data, off64_t offset, size_t size)
{
	git_map mapping;
	size_t alignment, in_page;
	off64_t aligned_start;
	int error;

	assert(data && size);

	if ((error = git__mmap_alignment(&alignment)) < 0)
		return error;

	/* the offset needs to be at the mmap boundary for the platform */
	in_page = offset % alignment;
	aligned_start = offset - in_page;

	error = p_mmap(&mapping, in_page + size, GIT_PROT_WRITE, GIT_MAP_SHARED, idx->pack->mwf.fd, aligned_start);
	if (error < 0)
		return error;

	memcpy((unsigned char *)mapping.data + in_page, data, size);
	p_munmap(&mapping);

	return 0;
}

/*
 * Write `size` bytes of `data` at the current end of the pack (mwf.size).
 * The recorded size is not updated here; callers adjust it afterwards.
 * Returns 0 on success (or when `size` is 0), a negative value on failure.
 */
static int append_to_pack(git_indexer *idx, const void *data, size_t size)
{
	git_mwindow_file *mwf = &idx->pack->mwf;
	size_t mmap_alignment, page_offset;
	off64_t new_size, page_start;
	int error;

	if (!size)
		return 0;

	if ((error = git__mmap_alignment(&mmap_alignment)) < 0)
		return error;

	/* Write a single byte to force the file system to allocate space now or
	 * report an error, since we can't report errors when writing using mmap.
	 * Round the size up to the nearest page so that we only need to perform file
	 * I/O when we add a page, instead of whenever we write even a single byte. */
	new_size = mwf->size + size;
	page_offset = new_size % mmap_alignment;
	page_start = new_size - page_offset;

	if (p_lseek(mwf->fd, page_start + mmap_alignment - 1, SEEK_SET) < 0 ||
	    p_write(mwf->fd, data, 1) < 0) {
		git_error_set(GIT_ERROR_OS, "cannot extend packfile '%s'", idx->pack->pack_name);
		return -1;
	}

	return write_at(idx, data, mwf->size, size);
}

664
/*
 * Try to read one complete entry from the pack at idx->off.
 *
 * Returns GIT_EBUFS when more data is needed; the state needed to resume
 * (an open stream, or a rewound offset) is kept in `idx`. On success the
 * entry is stored as an object or as a pending delta, the counters in
 * `stats` are updated and the progress callback is invoked.
 */
static int read_stream_object(git_indexer *idx, git_indexer_progress *stats)
{
	git_packfile_stream *stream = &idx->stream;
	off64_t entry_start = idx->off;
	size_t entry_size;
	git_object_t type;
	git_mwindow *w = NULL;
	int error;

	if (idx->pack->mwf.size <= idx->off + 20)
		return GIT_EBUFS;

	if (!idx->have_stream) {
		error = git_packfile_unpack_header(&entry_size, &type, &idx->pack->mwf, &w, &idx->off);
		if (error == GIT_EBUFS) {
			/* rewind so the header is parsed again on the next call */
			idx->off = entry_start;
			return error;
		}
		if (error < 0)
			return error;

		git_mwindow_close(&w);
		idx->entry_start = entry_start;
		git_hash_init(&idx->hash_ctx);
		git_buf_clear(&idx->entry_data);

		if (type == GIT_OBJECT_REF_DELTA || type == GIT_OBJECT_OFS_DELTA) {
			error = advance_delta_offset(idx, type);
			if (error == GIT_EBUFS) {
				idx->off = entry_start;
				return error;
			}
			if (error < 0)
				return error;

			idx->have_delta = 1;
		} else {
			idx->have_delta = 0;

			error = hash_header(&idx->hash_ctx, entry_size, type);
			if (error < 0)
				return error;
		}

		idx->have_stream = 1;
		idx->entry_type = type;

		error = git_packfile_stream_open(stream, idx->pack, idx->off);
		if (error < 0)
			return error;
	}

	/* Deltas are only skipped over; full objects are hashed as they stream. */
	error = idx->have_delta
		? read_object_stream(idx, stream)
		: hash_object_stream(idx, stream);

	idx->off = stream->curpos;
	if (error == GIT_EBUFS)
		return error;

	/* We want to free the stream resources no matter what here */
	idx->have_stream = 0;
	git_packfile_stream_dispose(stream);

	if (error < 0)
		return error;

	error = idx->have_delta ? store_delta(idx) : store_object(idx);
	if (error < 0)
		return error;

	if (!idx->have_delta)
		stats->indexed_objects++;
	stats->received_objects++;

	return do_progress_callback(idx, stats);
}

753
/*
 * Append `size` bytes of pack data to the indexer and index as many complete
 * entries as the data received so far allows.
 *
 * @param idx   the indexer
 * @param data  the bytes to append
 * @param size  number of bytes in `data`
 * @param stats progress counters; reset once the pack header is parsed
 * @return 0 on success (also when more data is needed), negative on error
 */
int git_indexer_append(git_indexer *idx, const void *data, size_t size, git_indexer_progress *stats)
{
	int error;
	struct git_pack_header *hdr;
	git_mwindow_file *mwf;

	/* validate before anything reaches through `idx` */
	assert(idx && data && stats);

	hdr = &idx->hdr;
	mwf = &idx->pack->mwf;

	if ((error = append_to_pack(idx, data, size)) < 0)
		return error;

	/* pass the size through untouched; narrowing to int would corrupt it */
	hash_partially(idx, data, size);

	/* Make sure we set the new size of the pack */
	idx->pack->mwf.size += size;

	if (!idx->parsed_header) {
		unsigned int total_objects;

		if ((unsigned)idx->pack->mwf.size < sizeof(struct git_pack_header))
			return 0;

		if ((error = parse_header(&idx->hdr, idx->pack)) < 0)
			return error;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr->hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		if (idx->nr_objects <= git_indexer__max_objects) {
			total_objects = (unsigned int)idx->nr_objects;
		} else {
			git_error_set(GIT_ERROR_INDEXER, "too many objects");
			return -1;
		}

		if (git_oidmap_new(&idx->pack->idx_cache) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, total_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, total_objects / 2, NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->local_objects = 0;
		stats->total_deltas = 0;
		stats->indexed_deltas = 0;
		stats->indexed_objects = 0;
		stats->total_objects = total_objects;

		if ((error = do_progress_callback(idx, stats)) != 0)
			return error;
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	git_mwindow_free_all(mwf);

	while (stats->indexed_objects < idx->nr_objects) {
		if ((error = read_stream_object(idx, stats)) != 0) {
			/* GIT_EBUFS only means we have to wait for more data */
			if (error == GIT_EBUFS)
				break;
			else
				goto on_error;
		}
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return error;
}
830

831
/*
 * Replace the last component of `path` with "pack-<hex hash><suffix>",
 * where the hash is idx->hash. Returns 0 on success, -1 on allocation
 * failure.
 */
static int index_path(git_buf *path, git_indexer *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t dir_len, needed;

	/* search backwards for '/' */
	for (dir_len = (size_t)path->size; dir_len > 0; dir_len--) {
		if (path->ptr[dir_len - 1] == '/')
			break;
	}

	needed = dir_len + 1 + strlen(prefix) + GIT_OID_HEXSZ + strlen(suffix) + 1;
	if (git_buf_grow(path, needed) < 0)
		return -1;

	git_buf_truncate(path, dir_len);
	git_buf_puts(path, prefix);

	/* the id is formatted straight into the buffer, so bump the size by hand */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;

	git_buf_puts(path, suffix);

	return git_buf_oom(path) ? -1 : 0;
}

853 854 855 856
/**
 * Rewind the packfile by the trailer, as we might need to fix the
 * packfile by injecting objects at the tail and must overwrite it.
 */
857
static void seek_back_trailer(git_indexer *idx)
858 859 860 861 862
{
	idx->pack->mwf.size -= GIT_OID_RAWSZ;
	git_mwindow_free_all(&idx->pack->mwf);
}

863
/*
 * Append the object `id`, read from the indexer's ODB, to the end of the
 * pack: the trailer is rewound, the object header and deflated contents are
 * written followed by a placeholder trailer, and the object is registered
 * through save_entry().
 *
 * Returns 0 on success, a negative value on failure.
 */
static int inject_object(git_indexer *idx, git_oid *id)
{
	git_odb_object *obj;
	struct entry *entry = NULL;
	struct git_pack_entry *pentry = NULL;
	git_oid foo = {{0}};
	unsigned char hdr[64];
	git_buf buf = GIT_BUF_INIT;
	off64_t entry_start;
	const void *data;
	size_t len, hdr_len;
	int error;

	seek_back_trailer(idx);
	entry_start = idx->pack->mwf.size;

	if (git_odb_read(&obj, idx->odb, id) < 0) {
		git_error_set(GIT_ERROR_INDEXER, "missing delta bases");
		return -1;
	}

	data = git_odb_object_data(obj);
	len = git_odb_object_size(obj);

	/* allocation failures go through cleanup so that `obj` is released */
	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL) {
		error = -1;
		goto cleanup;
	}

	entry->crc = crc32(0L, Z_NULL, 0);

	/* Write out the object header */
	hdr_len = git_packfile__object_header(hdr, len, git_odb_object_type(obj));
	if ((error = append_to_pack(idx, hdr, hdr_len)) < 0)
		goto cleanup;

	idx->pack->mwf.size += hdr_len;
	entry->crc = crc32(entry->crc, hdr, (uInt)hdr_len);

	if ((error = git_zstream_deflatebuf(&buf, data, len)) < 0)
		goto cleanup;

	/* And then the compressed object */
	if ((error = append_to_pack(idx, buf.ptr, buf.size)) < 0)
		goto cleanup;

	idx->pack->mwf.size += buf.size;
	entry->crc = htonl(crc32(entry->crc, (unsigned char *)buf.ptr, (uInt)buf.size));
	git_buf_dispose(&buf);

	/* Write a fake trailer so the pack functions play ball */

	if ((error = append_to_pack(idx, &foo, GIT_OID_RAWSZ)) < 0)
		goto cleanup;

	idx->pack->mwf.size += GIT_OID_RAWSZ;

	pentry = git__calloc(1, sizeof(struct git_pack_entry));
	if (pentry == NULL) {
		error = -1;
		goto cleanup;
	}

	git_oid_cpy(&pentry->sha1, id);
	git_oid_cpy(&entry->oid, id);
	idx->off = entry_start + hdr_len + len;

	error = save_entry(idx, entry, pentry, entry_start);

cleanup:
	if (error) {
		git__free(entry);
		git__free(pentry);
	}

	/* harmless when already disposed; releases the buffer on error paths */
	git_buf_dispose(&buf);
	git_odb_object_free(obj);
	return error;
}

937
/*
 * Try to make progress on a thin pack: find the first pending REF delta,
 * read the id of its base from the pack and, unless that object is already
 * part of the pack, inject it from the ODB.
 *
 * Returns 0 on success (stats->local_objects is bumped when an object was
 * injected) and a negative value on failure.
 */
static int fix_thin_pack(git_indexer *idx, git_indexer_progress *stats)
{
	struct delta_info *delta;
	git_mwindow *window = NULL;
	unsigned char *raw_base;
	unsigned int pos, avail = 0;
	off64_t cursor = 0;
	git_object_t type;
	git_oid base_id;
	size_t obj_size;
	int error, have_ref_delta = 0;

	assert(git_vector_length(&idx->deltas) > 0);

	if (idx->odb == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "cannot fix a thin pack without an ODB");
		return -1;
	}

	/* Loop until we find the first REF delta */
	git_vector_foreach(&idx->deltas, pos, delta) {
		if (!delta)
			continue;

		cursor = delta->delta_off;
		error = git_packfile_unpack_header(&obj_size, &type, &idx->pack->mwf, &window, &cursor);
		if (error < 0)
			return error;

		if (type == GIT_OBJECT_REF_DELTA) {
			have_ref_delta = 1;
			break;
		}
	}

	if (!have_ref_delta) {
		git_error_set(GIT_ERROR_INDEXER, "no REF_DELTA found, cannot inject object");
		return -1;
	}

	/* cursor now points to the base information, which is an OID */
	raw_base = git_mwindow_open(&idx->pack->mwf, &window, cursor, GIT_OID_RAWSZ, &avail);
	if (raw_base == NULL) {
		git_error_set(GIT_ERROR_INDEXER, "failed to map delta information");
		return -1;
	}

	git_oid_fromraw(&base_id, raw_base);
	git_mwindow_close(&window);

	/* nothing to inject when the base is already part of the pack */
	if (has_entry(idx, &base_id))
		return 0;

	if (inject_object(idx, &base_id) < 0)
		return -1;

	stats->local_objects++;

	return 0;
}

999
/*
 * Resolve the pending deltas. The list is walked repeatedly; every delta
 * whose base is available is unpacked, hashed and saved, then removed from
 * the list. When a full pass makes no progress, fix_thin_pack() is asked to
 * inject a missing base.
 *
 * Returns 0 when no unresolved delta is left, a negative value on failure
 * or when the progress callback asks to stop.
 */
static int resolve_deltas(git_indexer *idx, git_indexer_progress *stats)
{
	unsigned int i;
	int error;
	struct delta_info *delta;
	int progressed = 0, non_null = 0, progress_cb_result;

	while (idx->deltas.length > 0) {
		progressed = 0;
		non_null = 0;
		git_vector_foreach(&idx->deltas, i, delta) {
			git_rawobj obj = {0};

			if (!delta)
				continue;

			non_null = 1;
			idx->off = delta->delta_off;
			if ((error = git_packfile_unpack(&obj, idx->pack, &idx->off)) < 0) {
				if (error == GIT_PASSTHROUGH) {
					/* We have not seen the base object, we'll try again later. */
					continue;
				}
				return -1;
			}

			if (idx->do_verify && check_object_connectivity(idx, &obj) < 0) {
				/* TODO: error? continue? */
				/* the unpacked data is ours; release it before moving on */
				git__free(obj.data);
				continue;
			}

			if (hash_and_save(idx, &obj, delta->delta_off) < 0)
				continue;

			git__free(obj.data);
			stats->indexed_objects++;
			stats->indexed_deltas++;
			progressed = 1;
			if ((progress_cb_result = do_progress_callback(idx, stats)) < 0)
				return progress_cb_result;

			/* remove from the list */
			git_vector_set(NULL, &idx->deltas, i, NULL);
			git__free(delta);
		}

		/* if none were actually set, we're done */
		if (!non_null)
			break;

		if (!progressed && (fix_thin_pack(idx, stats) < 0)) {
			return -1;
		}
	}

	return 0;
}

1056
/*
 * Rewrite the pack header so that it counts the locally injected objects,
 * then recompute the trailer hash over the whole file. Returns 0 on success
 * and -1 when the header cannot be written or a window cannot be opened.
 */
static int update_header_and_rehash(git_indexer *idx, git_indexer_progress *stats)
{
	const size_t chunk = 1024*1024;
	git_mwindow_file *mwf = &idx->pack->mwf;
	git_mwindow *window = NULL;
	off64_t hashed;
	unsigned int avail;
	void *mapped;

	git_hash_init(&idx->trailer);

	/* Update the header to include the number of local objects we injected */
	idx->hdr.hdr_entries = htonl(stats->total_objects + stats->local_objects);
	if (write_at(idx, &idx->hdr, 0, sizeof(struct git_pack_header)) < 0)
		return -1;

	/*
	 * We now use the same technique as before to determine the
	 * hash. We keep reading up to the end and let
	 * hash_partially() keep the existing trailer out of the
	 * calculation.
	 */
	git_mwindow_free_all(mwf);
	idx->inbuf_len = 0;

	for (hashed = 0; hashed < mwf->size; hashed += avail) {
		mapped = git_mwindow_open(mwf, &window, hashed, chunk, &avail);
		if (mapped == NULL)
			return -1;

		hash_partially(idx, mapped, avail);
		git_mwindow_close(&window);
	}

	return 0;
}

1097
/*
 * Finalize the pack being indexed: verify the pack trailer, resolve the
 * outstanding deltas, write the ".idx" file and rename the packfile to
 * its final name.
 *
 * Returns 0 on success and a negative value on failure; an error code
 * returned by resolve_deltas() is passed through unchanged.
 */
int git_indexer_commit(git_indexer *idx, git_indexer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	int error;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	git_oid trailer_hash, file_hash;
	git_filebuf index_file = {0};
	void *packfile_trailer;

	if (!idx->parsed_header) {
		git_error_set(GIT_ERROR_INDEXER, "incomplete pack header");
		return -1;
	}

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off + GIT_OID_RAWSZ < idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "unexpected data at the end of the pack");
		return -1;
	}
	if (idx->off + GIT_OID_RAWSZ > idx->pack->mwf.size) {
		git_error_set(GIT_ERROR_INDEXER, "missing trailer at the end of the pack");
		return -1;
	}

	packfile_trailer = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_trailer == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	/* Compare the packfile trailer as it was sent to us and what we calculated */
	git_oid_fromraw(&file_hash, packfile_trailer);
	git_mwindow_close(&w);

	git_hash_final(&trailer_hash, &idx->trailer);
	if (git_oid_cmp(&file_hash, &trailer_hash)) {
		git_error_set(GIT_ERROR_INDEXER, "packfile trailer mismatch");
		return -1;
	}

	/* Freeze the number of deltas */
	stats->total_deltas = stats->total_objects - stats->indexed_objects;

	if ((error = resolve_deltas(idx, stats)) < 0)
		return error;

	if (stats->indexed_objects != stats->total_objects) {
		git_error_set(GIT_ERROR_INDEXER, "early EOF");
		return -1;
	}

	if (stats->local_objects > 0) {
		if (update_header_and_rehash(idx, stats) < 0)
			return -1;

		git_hash_final(&trailer_hash, &idx->trailer);

		/*
		 * The pack contents changed, so the trailer on disk must be
		 * replaced; a failed write would leave a pack whose trailer
		 * no longer matches the index we are about to write.
		 */
		if (write_at(idx, &trailer_hash, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ) < 0)
			return -1;
	}

	/*
	 * Is the resulting graph fully connected or are we still
	 * missing some objects? In the second case, we can
	 * bail out due to an incomplete and thus corrupt
	 * packfile.
	 */
	if (git_oidmap_size(idx->expected_oids) > 0) {
		git_error_set(GIT_ERROR_INDEXER, "packfile is missing %"PRIuZ" objects",
			git_oidmap_size(idx->expected_oids));
		return -1;
	}

	git_vector_sort(&idx->objects);

	/* Use the trailer hash as the pack file name to ensure
	 * files with different contents have different names */
	git_oid_cpy(&idx->hash, &trailer_hash);

	/* Derive the temporary index name by swapping the "pack" suffix for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_shorten(&filename, strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		return -1;

	if (git_filebuf_open(&index_file, filename.ptr,
		GIT_FILEBUF_HASH_CONTENTS |
		(idx->do_fsync ? GIT_FILEBUF_FSYNC : 0),
		idx->mode) < 0)
		goto on_error;

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	git_filebuf_write(&index_file, &hdr, sizeof(hdr));

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		git_filebuf_write(&index_file, &n, sizeof(n));
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->oid, sizeof(git_oid));
	}

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		git_filebuf_write(&index_file, &entry->crc, sizeof(uint32_t));
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/* UINT32_MAX marks an entry whose real offset is in offset_long */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		git_filebuf_write(&index_file, &n, sizeof(uint32_t));
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		git_filebuf_write(&index_file, &split, sizeof(uint32_t) * 2);
	}

	/* Write out the packfile trailer to the index */
	if (git_filebuf_write(&index_file, &trailer_hash, GIT_OID_RAWSZ) < 0)
		goto on_error;

	/* Write out the hash of the idx */
	if (git_filebuf_hash(&trailer_hash, &index_file) < 0)
		goto on_error;

	git_filebuf_write(&index_file, &trailer_hash, sizeof(git_oid));

	/* Figure out what the final name should be */
	if (index_path(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&index_file, filename.ptr) < 0)
		goto on_error;

	git_mwindow_free_all(&idx->pack->mwf);

	/* Truncate file to undo rounding up to next page_size in append_to_pack */
	if (p_ftruncate(idx->pack->mwf.fd, idx->pack->mwf.size) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to truncate pack file '%s'", idx->pack->pack_name);
		/* 'filename' is allocated by now; release it via the common path */
		goto on_error;
	}

	if (idx->do_fsync && p_fsync(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to fsync packfile");
		goto on_error;
	}

	/* We need to close the descriptor here so Windows doesn't choke on commit_at */
	if (p_close(idx->pack->mwf.fd) < 0) {
		git_error_set(GIT_ERROR_OS, "failed to close packfile");
		goto on_error;
	}

	idx->pack->mwf.fd = -1;

	if (index_path(&filename, idx, ".pack") < 0)
		goto on_error;

	/* And don't forget to rename the packfile to its new place. */
	if (p_rename(idx->pack->pack_name, git_buf_cstr(&filename)) < 0)
		goto on_error;

	/* And fsync the parent directory if we're asked to. */
	if (idx->do_fsync &&
		git_futils_fsync_parent(git_buf_cstr(&filename)) < 0)
		goto on_error;

	idx->pack_committed = 1;

	git_buf_dispose(&filename);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	git_filebuf_cleanup(&index_file);
	git_buf_dispose(&filename);
	return -1;
}

1298
/*
 * Release an indexer together with the resources it holds.
 * Passing NULL is a no-op.
 */
void git_indexer_free(git_indexer *idx)
{
	const git_oid *oid_key;
	git_oid *oid_value;
	size_t pos = 0;

	if (!idx)
		return;

	if (idx->have_stream)
		git_packfile_stream_dispose(&idx->stream);

	git_vector_free_deep(&idx->objects);

	/* Free every cached pack entry before the cache map itself */
	if (idx->pack->idx_cache != NULL) {
		struct git_pack_entry *cached;

		git_oidmap_foreach_value(idx->pack->idx_cache, cached, {
			git__free(cached);
		});

		git_oidmap_free(idx->pack->idx_cache);
	}

	git_vector_free_deep(&idx->deltas);

	/* The pack is only closed/freed while holding the mwindow mutex */
	if (git_mutex_lock(&git__mwindow_mutex) == 0) {
		if (!idx->pack_committed)
			git_packfile_close(idx->pack, true);

		git_packfile_free(idx->pack);
		git_mutex_unlock(&git__mwindow_mutex);
	}

	/* Free each value stored in expected_oids; the map is freed below */
	while (!git_oidmap_iterate((void **) &oid_value, idx->expected_oids, &pos, &oid_key))
		git__free(oid_value);

	git_hash_ctx_cleanup(&idx->trailer);
	git_hash_ctx_cleanup(&idx->hash_ctx);
	git_buf_dispose(&idx->entry_data);
	git_oidmap_free(idx->expected_oids);
	git__free(idx);
}