indexer.c 24.1 KB
Newer Older
1
/*
 * Copyright (C) 2009-2012 the libgit2 contributors
 *
 * This file is part of libgit2, distributed under the GNU GPL v2 with
 * a Linking Exception. For full terms see the included COPYING file.
 */

8 9
#include <zlib.h>

Carlos Martín Nieto committed
10
#include "git2/indexer.h"
11
#include "git2/object.h"
12
#include "git2/oid.h"
Carlos Martín Nieto committed
13

14 15
#include "common.h"
#include "pack.h"
Carlos Martín Nieto committed
16
#include "mwindow.h"
17
#include "posix.h"
18 19 20 21
#include "pack.h"
#include "filebuf.h"

#define UINT31_MAX (0x7FFFFFFF)
22

23
struct entry {
24
	git_oid oid;
25 26 27 28 29
	uint32_t crc;
	uint32_t offset;
	uint64_t offset_long;
};

30
struct git_indexer {
31
	struct git_pack_file *pack;
32 33 34 35 36
	size_t nr_objects;
	git_vector objects;
	git_filebuf file;
	unsigned int fanout[256];
	git_oid hash;
37
};
Carlos Martín Nieto committed
38

39 40
struct git_indexer_stream {
	unsigned int parsed_header :1,
41
		opened_pack :1,
42 43
		have_stream :1,
		have_delta :1;
44 45 46 47
	struct git_pack_file *pack;
	git_filebuf pack_file;
	git_filebuf index_file;
	git_off_t off;
48 49
	git_off_t entry_start;
	git_packfile_stream stream;
50 51 52 53
	size_t nr_objects;
	git_vector objects;
	git_vector deltas;
	unsigned int fanout[256];
54
	git_hash_ctx hash_ctx;
55
	git_oid hash;
56
	git_transfer_progress_callback progress_cb;
57
	void *progress_payload;
58
	char objbuf[8*1024];
59 60 61
};

struct delta_info {
62
	git_off_t delta_off;
63 64
};

Ben Straub committed
65
const git_oid *git_indexer_hash(const git_indexer *idx)
66 67 68 69
{
	return &idx->hash;
}

Ben Straub committed
70
const git_oid *git_indexer_stream_hash(const git_indexer_stream *idx)
71 72 73 74
{
	return &idx->hash;
}

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109
static int open_pack(struct git_pack_file **out, const char *filename)
{
	size_t namelen;
	struct git_pack_file *pack;
	struct stat st;
	int fd;

	namelen = strlen(filename);
	pack = git__calloc(1, sizeof(struct git_pack_file) + namelen + 1);
	GITERR_CHECK_ALLOC(pack);

	memcpy(pack->pack_name, filename, namelen + 1);

	if (p_stat(filename, &st) < 0) {
		giterr_set(GITERR_OS, "Failed to stat packfile.");
		goto cleanup;
	}

	if ((fd = p_open(pack->pack_name, O_RDONLY)) < 0) {
		giterr_set(GITERR_OS, "Failed to open packfile.");
		goto cleanup;
	}

	pack->mwf.fd = fd;
	pack->mwf.size = (git_off_t)st.st_size;

	*out = pack;
	return 0;

cleanup:
	git__free(pack);
	return -1;
}

static int parse_header(struct git_pack_header *hdr, struct git_pack_file *pack)
110 111 112 113
{
	int error;

	/* Verify we recognize this pack file format. */
114
	if ((error = p_read(pack->mwf.fd, hdr, sizeof(*hdr))) < 0) {
115 116 117
		giterr_set(GITERR_OS, "Failed to read in pack header");
		return error;
	}
118

119
	if (hdr->hdr_signature != ntohl(PACK_SIGNATURE)) {
120
		giterr_set(GITERR_INDEXER, "Wrong pack signature");
121 122
		return -1;
	}
123

124
	if (!pack_version_ok(hdr->hdr_version)) {
125
		giterr_set(GITERR_INDEXER, "Wrong pack version");
126 127
		return -1;
	}
Carlos Martín Nieto committed
128

129
	return 0;
130 131
}

132
static int objects_cmp(const void *a, const void *b)
133 134 135 136 137 138 139
{
	const struct entry *entrya = a;
	const struct entry *entryb = b;

	return git_oid_cmp(&entrya->oid, &entryb->oid);
}

140 141 142 143 144 145 146 147
static int cache_cmp(const void *a, const void *b)
{
	const struct git_pack_entry *ea = a;
	const struct git_pack_entry *eb = b;

	return git_oid_cmp(&ea->sha1, &eb->sha1);
}

148 149 150
int git_indexer_stream_new(
		git_indexer_stream **out,
		const char *prefix,
151
		git_transfer_progress_callback progress_cb,
152
		void *progress_payload)
153 154 155
{
	git_indexer_stream *idx;
	git_buf path = GIT_BUF_INIT;
156
	static const char suff[] = "/pack";
157
	int error;
158

159 160
	idx = git__calloc(1, sizeof(git_indexer_stream));
	GITERR_CHECK_ALLOC(idx);
161 162
	idx->progress_cb = progress_cb;
	idx->progress_payload = progress_payload;
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184

	error = git_buf_joinpath(&path, prefix, suff);
	if (error < 0)
		goto cleanup;

	error = git_filebuf_open(&idx->pack_file, path.ptr,
				 GIT_FILEBUF_TEMPORARY | GIT_FILEBUF_DO_NOT_BUFFER);
	git_buf_free(&path);
	if (error < 0)
		goto cleanup;

	*out = idx;
	return 0;

cleanup:
	git_buf_free(&path);
	git_filebuf_cleanup(&idx->pack_file);
	git__free(idx);
	return -1;
}

/* Try to store the delta so we can try to resolve it later */
185
static int store_delta(git_indexer_stream *idx)
186
{
187 188
	struct delta_info *delta;

189 190
	delta = git__calloc(1, sizeof(struct delta_info));
	GITERR_CHECK_ALLOC(delta);
191
	delta->delta_off = idx->entry_start;
192

193
	if (git_vector_insert(&idx->deltas, delta) < 0)
194 195 196 197 198
		return -1;

	return 0;
}

199 200 201 202 203
/*
 * Format the object header for an object of the given type and length
 * and feed it to the hash context.
 */
static void hash_header(git_hash_ctx *ctx, git_off_t len, git_otype type)
{
	char buffer[64];
	size_t hdrlen;

	hdrlen = git_odb__format_object_header(buffer, sizeof(buffer), (size_t)len, type);
	git_hash_update(ctx, buffer, hdrlen);
}

208
/*
 * Read from the stream until it reports no more data, feeding everything
 * that is read into idx->hash_ctx.
 *
 * Returns 0 once the stream is exhausted, or the negative value returned
 * by git_packfile_stream_read().
 */
static int hash_object_stream(git_indexer_stream *idx, git_packfile_stream *stream)
{
	ssize_t read;

	assert(idx && stream);

	do {
		if ((read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf))) < 0)
			break;

		git_hash_update(&idx->hash_ctx, idx->objbuf, read);
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246
/*
 * Move idx->off past the delta base description so that a packfile
 * stream can be opened on the delta data itself.
 *
 * Returns 0 on success or the negative value reported by get_delta_base().
 */
static int advance_delta_offset(git_indexer_stream *idx, git_otype type)
{
	git_mwindow *window = NULL;
	git_off_t base;

	assert(type == GIT_OBJ_REF_DELTA || type == GIT_OBJ_OFS_DELTA);

	/* a ref delta names its base with a raw oid */
	if (type == GIT_OBJ_REF_DELTA) {
		idx->off += GIT_OID_RAWSZ;
		return 0;
	}

	/* only the side effect on idx->off is wanted here */
	base = get_delta_base(idx->pack, &window, &idx->off, type, idx->entry_start);
	git_mwindow_close(&window);

	return (base < 0) ? (int)base : 0;
}

/* Read from the stream and discard any output */
247
static int read_object_stream(git_indexer_stream *idx, git_packfile_stream *stream)
248 249 250 251 252 253
{
	ssize_t read;

	assert(stream);

	do {
254
		read = git_packfile_stream_read(stream, idx->objbuf, sizeof(idx->objbuf));
255 256 257 258 259 260 261 262
	} while (read > 0);

	if (read < 0)
		return (int)read;

	return 0;
}

263 264 265 266 267 268 269 270 271
/*
 * Compute the CRC32 of `size` bytes of the pack starting at `start`,
 * walking over as many windows as are needed.
 *
 * On success the checksum, passed through htonl(), is stored in
 * `*crc_out` and 0 is returned. Returns -1 if a window cannot be opened.
 */
static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start, git_off_t size)
{
	void *ptr;
	uint32_t crc;
	unsigned int left, len;
	git_mwindow *w = NULL;

	crc = crc32(0L, Z_NULL, 0);
	while (size) {
		ptr = git_mwindow_open(mwf, &w, start, (size_t)size, &left);
		if (ptr == NULL)
			return -1;

		/* consume no more than what this window offers */
		len = min(left, (size_t)size);
		crc = crc32(crc, ptr, len);
		size -= len;
		start += len;
		git_mwindow_close(&w);
	}

	*crc_out = htonl(crc);
	return 0;
}

287
/*
 * Record the non-delta object that has just been read from the stream.
 *
 * Finalizes idx->hash_ctx to obtain the object id, adds a cache entry to
 * the pack, computes the CRC of the packed bytes, appends an index entry
 * to idx->objects and updates the fanout table.
 *
 * Returns 0 on success, -1 on failure.
 */
static int store_object(git_indexer_stream *idx)
{
	int i;
	git_oid oid;
	struct entry *entry;
	git_off_t entry_size;
	struct git_pack_entry *pentry;
	git_hash_ctx *ctx = &idx->hash_ctx;
	git_off_t entry_start = idx->entry_start;

	entry = git__calloc(1, sizeof(*entry));
	GITERR_CHECK_ALLOC(entry);

	/* must not return directly from here on: `entry` would leak */
	pentry = git__malloc(sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_hash_final(&oid, ctx);
	entry_size = idx->off - entry_start;
	if (entry_start > UINT31_MAX) {
		/* too large for the 31-bit table; flag it and keep the full offset */
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;
	if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
		git__free(pentry);
		goto on_error;
	}

	git_oid_cpy(&entry->oid, &oid);

	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* every bucket from the first oid byte upwards counts this object */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	git__free(entry);

	return -1;
}

340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
/*
 * Hash the unpacked object `obj`, which starts at `entry_start` in the
 * pack, and record it in the pack cache, the object list and the fanout
 * table.
 *
 * Returns 0 on success; the caller then still owns obj->data. On any
 * failure -1 is returned and obj->data has been freed here.
 */
static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t entry_start)
{
	int i;
	git_oid oid;
	size_t entry_size;
	struct entry *entry;
	struct git_pack_entry *pentry;

	entry = git__calloc(1, sizeof(*entry));
	if (entry == NULL)
		goto on_error;

	if (entry_start > UINT31_MAX) {
		/* too large for the 31-bit table; flag it and keep the full offset */
		entry->offset = UINT32_MAX;
		entry->offset_long = entry_start;
	} else {
		entry->offset = (uint32_t)entry_start;
	}

	/* FIXME: Parse the object instead of hashing it */
	if (git_odb__hashobj(&oid, obj) < 0) {
		giterr_set(GITERR_INDEXER, "Failed to hash object");
		goto on_error;
	}

	pentry = git__malloc(sizeof(struct git_pack_entry));
	if (pentry == NULL)
		goto on_error;

	git_oid_cpy(&pentry->sha1, &oid);
	pentry->offset = entry_start;
	if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
		git__free(pentry);
		goto on_error;
	}

	git_oid_cpy(&entry->oid, &oid);

	/* crc_object() fills in entry->crc itself */
	entry_size = (size_t)(idx->off - entry_start);
	if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
		goto on_error;

	/* Add the object to the list */
	if (git_vector_insert(&idx->objects, entry) < 0)
		goto on_error;

	/* every bucket from the first oid byte upwards counts this object */
	for (i = oid.id[0]; i < 256; ++i) {
		idx->fanout[i]++;
	}

	return 0;

on_error:
	/* single exit for every failure so nothing is leaked */
	git__free(entry);
	git__free(obj->data);
	return -1;
}
396

397
/* Invoke the user's progress callback, if one was supplied. */
static void do_progress_callback(git_indexer_stream *idx, git_transfer_progress *stats)
{
	if (!idx->progress_cb) return;
	idx->progress_cb(stats, idx->progress_payload);
}

403
/*
 * Feed `size` bytes of pack data to the streaming indexer.
 *
 * The data is appended to the temporary pack file and as many complete
 * entries as are available are processed. Non-delta objects are hashed
 * and recorded right away; delta entries are only remembered so that
 * they can be resolved on finalize. `stats` is updated and the progress
 * callback is invoked after the header and after every entry.
 *
 * Returns 0 on success, including when more data is needed before
 * further progress can be made, and -1 on error.
 */
int git_indexer_stream_add(git_indexer_stream *idx, const void *data, size_t size, git_transfer_progress *stats)
{
	int error;
	struct git_pack_header hdr;
	size_t processed;
	git_mwindow_file *mwf;

	assert(idx && data && stats);

	processed = stats->indexed_objects;

	if (git_filebuf_write(&idx->pack_file, data, size) < 0)
		return -1;

	/* Make sure we set the new size of the pack */
	if (idx->opened_pack) {
		idx->pack->mwf.size += size;
	} else {
		if (open_pack(&idx->pack, idx->pack_file.path_lock) < 0)
			return -1;
		idx->opened_pack = 1;
		if (git_mwindow_file_register(&idx->pack->mwf) < 0)
			return -1;
	}

	/* idx->pack is only guaranteed to exist from this point on */
	mwf = &idx->pack->mwf;

	if (!idx->parsed_header) {
		/* wait until the whole header has arrived */
		if ((unsigned)idx->pack->mwf.size < sizeof(hdr))
			return 0;

		if (parse_header(&hdr, idx->pack) < 0)
			return -1;

		idx->parsed_header = 1;
		idx->nr_objects = ntohl(hdr.hdr_entries);
		idx->off = sizeof(struct git_pack_header);

		/* for now, limit to 2^32 objects */
		assert(idx->nr_objects == (size_t)((unsigned int)idx->nr_objects));

		if (git_vector_init(&idx->pack->cache, (unsigned int)idx->nr_objects, cache_cmp) < 0)
			return -1;

		idx->pack->has_cache = 1;
		if (git_vector_init(&idx->objects, (unsigned int)idx->nr_objects, objects_cmp) < 0)
			return -1;

		if (git_vector_init(&idx->deltas, (unsigned int)(idx->nr_objects / 2), NULL) < 0)
			return -1;

		stats->received_objects = 0;
		stats->indexed_objects = 0;
		stats->total_objects = (unsigned int)idx->nr_objects;
		do_progress_callback(idx, stats);
	}

	/* Now that we have data in the pack, let's try to parse it */

	/* As the file grows any windows we try to use will be out of date */
	git_mwindow_free_all(mwf);
	while (processed < idx->nr_objects) {
		git_packfile_stream *stream = &idx->stream;
		git_off_t entry_start = idx->off;
		size_t entry_size;
		git_otype type;
		git_mwindow *w = NULL;

		/* not enough data past the current offset yet; wait for more */
		if (idx->pack->mwf.size <= idx->off + 20)
			return 0;

		if (!idx->have_stream) {
			error = git_packfile_unpack_header(&entry_size, &type, mwf, &w, &idx->off);
			if (error == GIT_EBUFS) {
				/* incomplete entry header: rewind and retry on the next call */
				idx->off = entry_start;
				return 0;
			}
			if (error < 0)
				return -1;

			git_mwindow_close(&w);
			idx->entry_start = entry_start;
			git_hash_ctx_init(&idx->hash_ctx);

			if (type == GIT_OBJ_REF_DELTA || type == GIT_OBJ_OFS_DELTA) {
				error = advance_delta_offset(idx, type);
				if (error == GIT_EBUFS) {
					idx->off = entry_start;
					return 0;
				}
				if (error < 0)
					return -1;

				idx->have_delta = 1;
			} else {
				idx->have_delta = 0;
				hash_header(&idx->hash_ctx, entry_size, type);
			}

			if (git_packfile_stream_open(stream, idx->pack, idx->off) < 0)
				goto on_error;

			/* only claim a stream once it has really been opened */
			idx->have_stream = 1;
		}

		if (idx->have_delta) {
			error = read_object_stream(idx, stream);
		} else {
			error = hash_object_stream(idx, stream);
		}

		idx->off = stream->curpos;
		/* the stream stays open so the next call resumes where this one stopped */
		if (error == GIT_EBUFS)
			return 0;

		/* We want to free the stream resources no matter what here */
		idx->have_stream = 0;
		git_packfile_stream_free(stream);

		if (error < 0)
			goto on_error;

		if (idx->have_delta) {
			error = store_delta(idx);
		} else {
			error = store_object(idx);
		}

		if (error < 0)
			goto on_error;

		/* deltas only count as indexed once they have been resolved */
		if (!idx->have_delta) {
			stats->indexed_objects = (unsigned int)++processed;
		}
		stats->received_objects++;

		do_progress_callback(idx, stats);
	}

	return 0;

on_error:
	git_mwindow_free_all(mwf);
	return -1;
}
548

549 550 551 552
/*
 * Rewrite `path` in place: everything after the last '/' is replaced by
 * "pack-" followed by the hex form of idx->hash and then `suffix`.
 *
 * Returns 0 on success, -1 if the buffer could not be grown.
 */
static int index_path_stream(git_buf *path, git_indexer_stream *idx, const char *suffix)
{
	const char prefix[] = "pack-";
	size_t slash = (size_t)path->size;

	/* search backwards for '/' */
	while (slash > 0 && path->ptr[slash - 1] != '/')
		slash--;

	if (git_buf_grow(path, slash + 1 + strlen(prefix) +
					 GIT_OID_HEXSZ + strlen(suffix) + 1) < 0)
		return -1;

	git_buf_truncate(path, slash);
	git_buf_puts(path, prefix);
	/* the oid is formatted straight into the buffer, so fix up its size by hand */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;
	git_buf_puts(path, suffix);

	return git_buf_oom(path) ? -1 : 0;
}

571
/*
 * Unpack every delta entry remembered while streaming, then hash and
 * record the resulting object. stats->indexed_objects is bumped and the
 * progress callback invoked after each one.
 *
 * Returns 0 on success, -1 as soon as one entry fails.
 */
static int resolve_deltas(git_indexer_stream *idx, git_transfer_progress *stats)
{
	unsigned int i;
	struct delta_info *delta;

	git_vector_foreach(&idx->deltas, i, delta) {
		git_rawobj obj;

		idx->off = delta->delta_off;
		if (git_packfile_unpack(&obj, idx->pack, &idx->off) < 0)
			return -1;

		/* obj.data is not freed here on failure; hash_and_save()'s error path does it */
		if (hash_and_save(idx, &obj, delta->delta_off) < 0)
			return -1;

		git__free(obj.data);
		stats->indexed_objects++;
		do_progress_callback(idx, stats);
	}

	return 0;
}

594
/*
 * Finish a streamed pack: resolve the stored deltas, write the index
 * file and move both the index and the pack to their final names
 * ("pack-<hash>.idx" and "pack-<hash>.pack" next to the temporary pack).
 *
 * Returns 0 on success, -1 on error.
 */
int git_indexer_stream_finalize(git_indexer_stream *idx, git_transfer_progress *stats)
{
	git_mwindow *w = NULL;
	unsigned int i, long_offsets = 0, left;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	void *packfile_hash;
	git_oid file_hash;
	git_hash_ctx ctx;

	if (git_hash_ctx_init(&ctx) < 0)
		return -1;

	/* Test for this before resolve_deltas(), as it plays with idx->off */
	if (idx->off < idx->pack->mwf.size - GIT_OID_RAWSZ) {
		giterr_set(GITERR_INDEXER, "Indexing error: unexpected data at the end of the pack");
		goto on_early_error;
	}

	if (idx->deltas.length > 0)
		if (resolve_deltas(idx, stats) < 0)
			goto on_early_error;

	if (stats->indexed_objects != stats->total_objects) {
		giterr_set(GITERR_INDEXER, "Indexing error: early EOF");
		goto on_early_error;
	}

	git_vector_sort(&idx->objects);

	/* the temporary index lives next to the pack: swap "pack" for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_truncate(&filename, filename.size - strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename))
		goto on_early_error;

	if (git_filebuf_open(&idx->index_file, filename.ptr, GIT_FILEBUF_HASH_CONTENTS) < 0)
		goto on_error;

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	if (git_filebuf_write(&idx->index_file, &hdr, sizeof(hdr)) < 0)
		goto on_error;

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		if (git_filebuf_write(&idx->index_file, &n, sizeof(n)) < 0)
			goto on_error;
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		if (git_filebuf_write(&idx->index_file, &entry->oid, sizeof(git_oid)) < 0 ||
			git_hash_update(&ctx, &entry->oid, GIT_OID_RAWSZ) < 0)
			goto on_error;
	}

	/* the hash over the sorted ids is what names the final files */
	if (git_hash_final(&idx->hash, &ctx) < 0)
		goto on_error;

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		if (git_filebuf_write(&idx->index_file, &entry->crc, sizeof(uint32_t)) < 0)
			goto on_error;
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/* large offsets store an index into the long offset table instead */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		if (git_filebuf_write(&idx->index_file, &n, sizeof(uint32_t)) < 0)
			goto on_error;
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		if (git_filebuf_write(&idx->index_file, &split, sizeof(uint32_t) * 2) < 0)
			goto on_error;
	}

	/* Write out the packfile trailer */
	packfile_hash = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_hash == NULL) {
		git_mwindow_close(&w);
		goto on_error;
	}

	memcpy(&file_hash, packfile_hash, GIT_OID_RAWSZ);
	git_mwindow_close(&w);

	if (git_filebuf_write(&idx->index_file, &file_hash, sizeof(git_oid)) < 0)
		goto on_error;

	/* Append the hash of the index contents written so far */
	if (git_filebuf_hash(&file_hash, &idx->index_file) < 0)
		goto on_error;

	if (git_filebuf_write(&idx->index_file, &file_hash, sizeof(git_oid)) < 0)
		goto on_error;

	/* Figure out what the final name should be */
	if (index_path_stream(&filename, idx, ".idx") < 0)
		goto on_error;

	/* Commit file */
	if (git_filebuf_commit_at(&idx->index_file, filename.ptr, GIT_PACK_FILE_MODE) < 0)
		goto on_error;

	git_mwindow_free_all(&idx->pack->mwf);
	p_close(idx->pack->mwf.fd);

	/* the windows and the fd are gone now: failures must skip on_error */
	if (index_path_stream(&filename, idx, ".pack") < 0)
		goto on_closed_error;
	/* And don't forget to rename the packfile to its new place. */
	if (git_filebuf_commit_at(&idx->pack_file, filename.ptr, GIT_PACK_FILE_MODE) < 0)
		goto on_closed_error;

	git_buf_free(&filename);
	git_hash_ctx_cleanup(&ctx);
	return 0;

on_error:
	git_mwindow_free_all(&idx->pack->mwf);
	p_close(idx->pack->mwf.fd);
on_closed_error:
	git_filebuf_cleanup(&idx->index_file);
on_early_error:
	git_buf_free(&filename);
	git_hash_ctx_cleanup(&ctx);
	return -1;
}

729 730 731 732 733 734 735 736 737 738 739 740 741
/*
 * Release a streaming indexer together with every entry it still owns:
 * the index entries, the pack cache entries and the pending deltas.
 * Passing NULL is allowed.
 */
void git_indexer_stream_free(git_indexer_stream *idx)
{
	unsigned int i;
	struct entry *e;
	struct git_pack_entry *pe;
	struct delta_info *delta;

	if (idx == NULL)
		return;

	git_vector_foreach(&idx->objects, i, e)
		git__free(e);
	git_vector_free(&idx->objects);
	/* the pack only exists once some data has been added */
	if (idx->pack) {
		git_vector_foreach(&idx->pack->cache, i, pe)
			git__free(pe);
		git_vector_free(&idx->pack->cache);
	}
	git_vector_foreach(&idx->deltas, i, delta)
		git__free(delta);
	git_vector_free(&idx->deltas);
	git__free(idx->pack);
	git__free(idx);
}

754 755 756 757 758 759 760 761 762 763
/*
 * Create an indexer for the packfile at `packname`.
 *
 * The pack is opened, its header is validated and the object and cache
 * vectors are sized from the object count in the header.
 *
 * Returns 0 and stores the indexer in `*out`, or -1 on failure.
 */
int git_indexer_new(git_indexer **out, const char *packname)
{
	git_indexer *idx;
	struct git_pack_header hdr;
	int error;

	assert(out && packname);

	idx = git__calloc(1, sizeof(git_indexer));
	GITERR_CHECK_ALLOC(idx);

	/*
	 * Without a pack there is nothing for git_indexer_free() to tear
	 * down, and it would dereference idx->pack, so bail out directly.
	 */
	if (open_pack(&idx->pack, packname) < 0) {
		git__free(idx);
		return -1;
	}

	if ((error = parse_header(&hdr, idx->pack)) < 0)
		goto cleanup;

	idx->nr_objects = ntohl(hdr.hdr_entries);

	/* for now, limit to 2^32 objects */
	assert(idx->nr_objects == (size_t)((unsigned int)idx->nr_objects));

	error = git_vector_init(&idx->pack->cache, (unsigned int)idx->nr_objects, cache_cmp);
	if (error < 0)
		goto cleanup;

	idx->pack->has_cache = 1;
	error = git_vector_init(&idx->objects, (unsigned int)idx->nr_objects, objects_cmp);
	if (error < 0)
		goto cleanup;

	*out = idx;

	return 0;

cleanup:
	git_indexer_free(idx);

	return -1;
}

794
/*
 * Rewrite `path` in place: everything after the last '/' is replaced by
 * "pack-" followed by the hex form of idx->hash and ".idx".
 *
 * Returns 0 on success, -1 if the buffer could not be grown.
 */
static int index_path(git_buf *path, git_indexer *idx)
{
	const char prefix[] = "pack-", suffix[] = ".idx";
	size_t slash = (size_t)path->size;

	/* search backwards for '/' */
	while (slash > 0 && path->ptr[slash - 1] != '/')
		slash--;

	if (git_buf_grow(path, slash + 1 + strlen(prefix) +
					 GIT_OID_HEXSZ + strlen(suffix) + 1) < 0)
		return -1;

	git_buf_truncate(path, slash);
	git_buf_puts(path, prefix);
	/* the oid is formatted straight into the buffer, so fix up its size by hand */
	git_oid_fmt(path->ptr + git_buf_len(path), &idx->hash);
	path->size += GIT_OID_HEXSZ;
	git_buf_puts(path, suffix);

	return git_buf_oom(path) ? -1 : 0;
}

816
/*
 * Write the index file for the objects collected by git_indexer_run()
 * and commit it as "pack-<hash>.idx" next to the pack. idx->hash is
 * filled in with the hash over the sorted object ids.
 *
 * Returns 0 on success or a negative value on failure.
 */
int git_indexer_write(git_indexer *idx)
{
	git_mwindow *w = NULL;
	int error;
	unsigned int i, long_offsets = 0, left;
	struct git_pack_idx_header hdr;
	git_buf filename = GIT_BUF_INIT;
	struct entry *entry;
	void *packfile_hash;
	git_oid file_hash;
	git_hash_ctx ctx;

	if (git_hash_ctx_init(&ctx) < 0)
		return -1;

	git_vector_sort(&idx->objects);

	/* the temporary index lives next to the pack: swap "pack" for "idx" */
	git_buf_sets(&filename, idx->pack->pack_name);
	git_buf_truncate(&filename, filename.size - strlen("pack"));
	git_buf_puts(&filename, "idx");
	if (git_buf_oom(&filename)) {
		/* don't leave the hash context behind on this early exit */
		git_buf_free(&filename);
		git_hash_ctx_cleanup(&ctx);
		return -1;
	}

	error = git_filebuf_open(&idx->file, filename.ptr, GIT_FILEBUF_HASH_CONTENTS);
	if (error < 0)
		goto cleanup;

	/* Write out the header */
	hdr.idx_signature = htonl(PACK_IDX_SIGNATURE);
	hdr.idx_version = htonl(2);
	error = git_filebuf_write(&idx->file, &hdr, sizeof(hdr));
	if (error < 0)
		goto cleanup;

	/* Write out the fanout table */
	for (i = 0; i < 256; ++i) {
		uint32_t n = htonl(idx->fanout[i]);
		error = git_filebuf_write(&idx->file, &n, sizeof(n));
		if (error < 0)
			goto cleanup;
	}

	/* Write out the object names (SHA-1 hashes) */
	git_vector_foreach(&idx->objects, i, entry) {
		if ((error = git_filebuf_write(&idx->file, &entry->oid, sizeof(git_oid))) < 0 ||
			(error = git_hash_update(&ctx, &entry->oid, GIT_OID_RAWSZ)) < 0)
			goto cleanup;
	}

	if ((error = git_hash_final(&idx->hash, &ctx)) < 0)
		goto cleanup;

	/* Write out the CRC32 values */
	git_vector_foreach(&idx->objects, i, entry) {
		error = git_filebuf_write(&idx->file, &entry->crc, sizeof(uint32_t));
		if (error < 0)
			goto cleanup;
	}

	/* Write out the offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t n;

		/* large offsets store an index into the long offset table instead */
		if (entry->offset == UINT32_MAX)
			n = htonl(0x80000000 | long_offsets++);
		else
			n = htonl(entry->offset);

		error = git_filebuf_write(&idx->file, &n, sizeof(uint32_t));
		if (error < 0)
			goto cleanup;
	}

	/* Write out the long offsets */
	git_vector_foreach(&idx->objects, i, entry) {
		uint32_t split[2];

		if (entry->offset != UINT32_MAX)
			continue;

		split[0] = htonl(entry->offset_long >> 32);
		split[1] = htonl(entry->offset_long & 0xffffffff);

		error = git_filebuf_write(&idx->file, &split, sizeof(uint32_t) * 2);
		if (error < 0)
			goto cleanup;
	}

	/* Write out the packfile trailer */
	packfile_hash = git_mwindow_open(&idx->pack->mwf, &w, idx->pack->mwf.size - GIT_OID_RAWSZ, GIT_OID_RAWSZ, &left);
	if (packfile_hash == NULL) {
		git_mwindow_close(&w);
		error = -1;
		goto cleanup;
	}

	/* copy while the window is still held, then release it exactly once */
	memcpy(&file_hash, packfile_hash, GIT_OID_RAWSZ);
	git_mwindow_close(&w);

	error = git_filebuf_write(&idx->file, &file_hash, sizeof(git_oid));
	if (error < 0)
		goto cleanup;

	/* Write out the index sha */
	error = git_filebuf_hash(&file_hash, &idx->file);
	if (error < 0)
		goto cleanup;

	error = git_filebuf_write(&idx->file, &file_hash, sizeof(git_oid));
	if (error < 0)
		goto cleanup;

	/* Figure out what the final name should be */
	error = index_path(&filename, idx);
	if (error < 0)
		goto cleanup;

	/* Commit file */
	error = git_filebuf_commit_at(&idx->file, filename.ptr, GIT_PACK_FILE_MODE);

cleanup:
	git_mwindow_free_all(&idx->pack->mwf);
	git_mwindow_file_deregister(&idx->pack->mwf);
	if (error < 0)
		git_filebuf_cleanup(&idx->file);
	git_buf_free(&filename);
	git_hash_ctx_cleanup(&ctx);

	return error;
}

949
/*
 * Walk every object in the pack, unpack and hash it, and record its id,
 * offset and CRC so that git_indexer_write() can produce the index.
 * stats->total_objects and stats->indexed_objects are kept up to date.
 *
 * Returns 0 on success or a negative value on failure.
 */
int git_indexer_run(git_indexer *idx, git_transfer_progress *stats)
{
	git_mwindow_file *mwf;
	git_off_t off = sizeof(struct git_pack_header);
	int error;
	struct entry *entry = NULL; /* entry not yet owned by idx->objects */
	void *obj_data = NULL;      /* unpacked data not yet released */
	unsigned int left, processed;

	assert(idx && stats);

	mwf = &idx->pack->mwf;
	error = git_mwindow_file_register(mwf);
	if (error < 0)
		return error;

	stats->total_objects = (unsigned int)idx->nr_objects;
	stats->indexed_objects = processed = 0;

	while (processed < idx->nr_objects) {
		git_rawobj obj;
		git_oid oid;
		struct git_pack_entry *pentry;
		git_mwindow *w = NULL;
		int i;
		git_off_t entry_start = off;
		void *packed;
		size_t entry_size;

		entry = git__calloc(1, sizeof(*entry));
		if (entry == NULL) {
			error = -1;
			goto cleanup;
		}

		if (off > UINT31_MAX) {
			/* too large for the 31-bit table; flag it and keep the full offset */
			entry->offset = UINT32_MAX;
			entry->offset_long = off;
		} else {
			entry->offset = (uint32_t)off;
		}

		error = git_packfile_unpack(&obj, idx->pack, &off);
		if (error < 0)
			goto cleanup;
		obj_data = obj.data;

		/* FIXME: Parse the object instead of hashing it */
		error = git_odb__hashobj(&oid, &obj);
		if (error < 0) {
			giterr_set(GITERR_INDEXER, "Failed to hash object");
			goto cleanup;
		}

		pentry = git__malloc(sizeof(struct git_pack_entry));
		if (pentry == NULL) {
			error = -1;
			goto cleanup;
		}

		git_oid_cpy(&pentry->sha1, &oid);
		pentry->offset = entry_start;
		error = git_vector_insert(&idx->pack->cache, pentry);
		if (error < 0) {
			/* the cache did not take ownership */
			git__free(pentry);
			goto cleanup;
		}

		git_oid_cpy(&entry->oid, &oid);
		entry->crc = crc32(0L, Z_NULL, 0);

		entry_size = (size_t)(off - entry_start);
		packed = git_mwindow_open(mwf, &w, entry_start, entry_size, &left);
		if (packed == NULL) {
			error = -1;
			goto cleanup;
		}
		entry->crc = htonl(crc32(entry->crc, packed, (uInt)entry_size));
		git_mwindow_close(&w);

		/* Add the object to the list */
		error = git_vector_insert(&idx->objects, entry);
		if (error < 0)
			goto cleanup;
		entry = NULL; /* now owned by idx->objects */

		/* every bucket from the first oid byte upwards counts this object */
		for (i = oid.id[0]; i < 256; ++i) {
			idx->fanout[i]++;
		}

		git__free(obj_data);
		obj_data = NULL;

		stats->indexed_objects = ++processed;
	}

cleanup:
	/* both are NULL unless we left the loop in the middle of an object */
	git__free(entry);
	git__free(obj_data);
	git_mwindow_free_all(mwf);

	return error;
}

1045
/*
 * Release an indexer: close the pack's file descriptor, deregister its
 * window file and free every index and cache entry. Passing NULL is
 * allowed, and so is an indexer whose pack was never opened.
 */
void git_indexer_free(git_indexer *idx)
{
	unsigned int i;
	struct entry *e;
	struct git_pack_entry *pe;

	if (idx == NULL)
		return;

	git_vector_foreach(&idx->objects, i, e)
		git__free(e);
	git_vector_free(&idx->objects);

	/* the pack may be missing if the indexer was only partly constructed */
	if (idx->pack) {
		p_close(idx->pack->mwf.fd);
		git_mwindow_file_deregister(&idx->pack->mwf);
		git_vector_foreach(&idx->pack->cache, i, pe)
			git__free(pe);
		git_vector_free(&idx->pack->cache);
	}

	git__free(idx->pack);
	git__free(idx);
}
1065