Commit 005e7715 by lhchavez

multipack: Introduce a parser for multi-pack-index files

This change is the first in a series to add support for git's
multi-pack-index. This should speed up large repositories significantly.

Part of: #5399
parent 6d1f1926
<V`oӤk @rshuffleDHE-PSK-AR'MID[
\ No newline at end of file
/*
* libgit2 multi-pack-index fuzzer target.
*
* Copyright (C) the libgit2 contributors. All rights reserved.
*
* This file is part of libgit2, distributed under the GNU GPL v2 with
* a Linking Exception. For full terms see the included COPYING file.
*/
#include <stdio.h>
#include "git2.h"
#include "buffer.h"
#include "common.h"
#include "futils.h"
#include "hash.h"
#include "midx.h"
/*
 * One-time setup hook for the fuzzer: initializes libgit2 before any input
 * is processed. The command-line arguments are not used.
 *
 * Returns 0 on success; aborts the process if libgit2 cannot be initialized.
 */
int LLVMFuzzerInitialize(int *argc, char ***argv)
{
	int init_result;

	GIT_UNUSED(argc);
	GIT_UNUSED(argv);

	init_result = git_libgit2_init();
	if (init_result >= 0)
		return 0;

	/* Nothing useful can be fuzzed without the library; fail loudly. */
	fprintf(stderr, "Failed to initialize libgit2\n");
	abort();

	return 0; /* not reached */
}
/*
 * Fuzzer entry point: feeds one input to the multi-pack-index parser and, if
 * parsing succeeds, performs a lookup to exercise the search path as well.
 *
 * The first four bytes of the input are a control header and are never
 * parsed. When the high bit of the first byte is set, the SHA1 of the
 * remaining bytes is appended as a trailer so that the checksum validation
 * in git_midx_parse() can pass. Otherwise the remaining bytes are handed to
 * the parser unmodified.
 *
 * Inputs shorter than four bytes are ignored. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
{
	git_midx_file idx = {{0}};
	git_midx_entry e;
	git_buf midx_buf = GIT_BUF_INIT;
	git_oid oid = {{0}};
	bool append_hash = false;

	if (size < 4)
		return 0;

	/*
	 * If the first byte in the stream has the high bit set, append the
	 * SHA1 hash so that the packfile is somewhat valid.
	 */
	append_hash = (*data & 0x80) != 0;

	/* Keep a 4-byte alignment to avoid unaligned accesses. */
	data += 4;
	size -= 4;

	if (append_hash) {
		if (git_buf_init(&midx_buf, size + sizeof(oid)) < 0)
			goto cleanup;
		if (git_hash_buf(&oid, data, size) < 0) {
			fprintf(stderr, "Failed to compute the SHA1 hash\n");
			abort();
		}

		/*
		 * Append through git_buf_put() so that the buffer's length is
		 * updated along with its contents. Copying into the reserved
		 * storage with memcpy() leaves git_buf_len() at zero, which
		 * would hand the parser an empty input every time.
		 */
		if (git_buf_put(&midx_buf, (const char *)data, size) < 0 ||
		    git_buf_put(&midx_buf, (const char *)&oid, sizeof(oid)) < 0)
			goto cleanup;
	} else {
		git_buf_attach_notowned(&midx_buf, (char *)data, size);
	}

	if (git_midx_parse(&idx, (const unsigned char *)git_buf_cstr(&midx_buf), git_buf_len(&midx_buf)) < 0)
		goto cleanup;

	/* Search for any oid, just to exercise that codepath. */
	if (git_midx_entry_find(&e, &idx, &oid, GIT_OID_HEXSZ) < 0)
		goto cleanup;

cleanup:
	/* Safe on every path: idx is zero-initialized and midx_buf starts empty. */
	git_midx_close(&idx);
	git_buf_dispose(&midx_buf);
	return 0;
}
/*
* Copyright (C) the libgit2 contributors. All rights reserved.
*
* This file is part of libgit2, distributed under the GNU GPL v2 with
* a Linking Exception. For full terms see the included COPYING file.
*/
#include "midx.h"
#include "buffer.h"
#include "futils.h"
#include "hash.h"
#include "odb.h"
#include "pack.h"
/* File mode constant for multi-pack-index files (not referenced by the parsing code shown here). */
#define GIT_MIDX_FILE_MODE 0444
/* Expected value of the header's signature field, compared in network byte order. */
#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
/* The only header version accepted by git_midx_parse(). */
#define MIDX_VERSION 1
/* The only object id version accepted by git_midx_parse(). */
#define MIDX_OBJECT_ID_VERSION 1
/*
 * The fixed-size header at the very start of a multi-pack-index file.
 *
 * git_midx_parse() overlays this struct directly on the file bytes, so the
 * field order and widths must match the on-disk layout exactly. The
 * multi-byte fields are stored in network byte order.
 */
struct git_midx_header {
/* Must equal MIDX_SIGNATURE (after byte-order conversion). */
uint32_t signature;
/* Must equal MIDX_VERSION. */
uint8_t version;
/* Must equal MIDX_OBJECT_ID_VERSION. */
uint8_t object_id_version;
/* Number of entries in the chunk table that follows the header; must be non-zero. */
uint8_t chunks;
/* Not inspected by the parser in this file. */
uint8_t base_midx_files;
/* Number of names expected in the Packfile Names chunk. */
uint32_t packfiles;
};
/*
 * Four-byte chunk identifiers recognized in the chunk table. Any other
 * identifier makes git_midx_parse() reject the file.
 */
#define MIDX_PACKFILE_NAMES_ID 0x504e414d /* "PNAM" */
#define MIDX_OID_FANOUT_ID 0x4f494446 /* "OIDF" */
#define MIDX_OID_LOOKUP_ID 0x4f49444c /* "OIDL" */
#define MIDX_OBJECT_OFFSETS_ID 0x4f4f4646 /* "OOFF" */
#define MIDX_OBJECT_LARGE_OFFSETS_ID 0x4c4f4646 /* "LOFF" */
/*
 * Location of one chunk inside the multi-pack-index data.
 *
 * A zero offset means the chunk was not present in the chunk table; the
 * per-chunk parsers use that to report a missing chunk.
 */
struct git_midx_chunk {
/* Byte offset of the chunk from the start of the file data. */
off64_t offset;
/* Size of the chunk in bytes, derived from where the next chunk (or the trailer) begins. */
size_t length;
};
/*
 * Record an ODB-class error describing why a multi-pack-index was rejected.
 *
 * Always returns -1, so callers can write `return midx_error("...")`.
 */
static int midx_error(const char *message)
{
	const int failure = -1;

	git_error_set(GIT_ERROR_ODB, "invalid multi-pack-index file - %s", message);

	return failure;
}
/*
 * Parse the Packfile Names chunk into idx->packfile_names.
 *
 * The chunk is a sequence of NUL-terminated names. Exactly `packfiles` names
 * are read, and each one must be non-empty, terminated within the chunk,
 * strictly greater (per strcmp) than the previous name, end in ".idx" with at
 * least one character before the suffix, and contain no '/' or '\\'.
 *
 * The vector stores pointers into `data` rather than copies, so `data` has to
 * stay valid for as long as the vector is used.
 *
 * Returns 0 on success or a negative value on error. The vector may already
 * be initialized when an error is returned; it is released by
 * git_midx_close().
 */
static int midx_parse_packfile_names(
		git_midx_file *idx,
		const unsigned char *data,
		uint32_t packfiles,
		struct git_midx_chunk *chunk)
{
	int error;
	uint32_t i;
	char *packfile_name;
	size_t chunk_size, len;

	if (chunk->offset == 0)
		return midx_error("missing Packfile Names chunk");
	if (chunk->length == 0)
		return midx_error("empty Packfile Names chunk");
	if ((error = git_vector_init(&idx->packfile_names, packfiles, git__strcmp_cb)) < 0)
		return error;

	/* Only derive the cursor once the chunk is known to exist. */
	packfile_name = (char *)(data + chunk->offset);
	chunk_size = chunk->length;

	for (i = 0; i < packfiles; ++i) {
		/* Bounded scan: never look past the bytes that remain in the chunk. */
		len = p_strnlen(packfile_name, chunk_size);
		if (len == 0)
			return midx_error("empty packfile name");
		if (len + 1 > chunk_size)
			return midx_error("unterminated packfile name");

		/* From here on the name is NUL-terminated inside the chunk. */
		if ((error = git_vector_insert(&idx->packfile_names, packfile_name)) < 0)
			return error;

		if (i && strcmp(git_vector_get(&idx->packfile_names, i - 1), packfile_name) >= 0)
			return midx_error("packfile names are not sorted");
		if (len <= strlen(".idx") || git__suffixcmp(packfile_name, ".idx") != 0)
			return midx_error("non-.idx packfile name");
		if (strchr(packfile_name, '/') != NULL || strchr(packfile_name, '\\') != NULL)
			return midx_error("non-local packfile");

		/* Step over the name and its terminator. */
		packfile_name += len + 1;
		chunk_size -= len + 1;
	}

	return 0;
}
/*
 * Validate the OID Fanout chunk and publish it through idx->oid_fanout.
 *
 * The chunk must hold exactly 256 four-byte counters in network byte order
 * that never decrease. The final counter becomes idx->num_objects.
 *
 * Returns 0 on success, -1 (with an error set) otherwise.
 */
static int midx_parse_oid_fanout(
		git_midx_file *idx,
		const unsigned char *data,
		struct git_midx_chunk *chunk_oid_fanout)
{
	const uint32_t *fanout;
	uint32_t bucket, running_total = 0;

	if (chunk_oid_fanout->offset == 0)
		return midx_error("missing OID Fanout chunk");
	if (chunk_oid_fanout->length == 0)
		return midx_error("empty OID Fanout chunk");
	if (chunk_oid_fanout->length != 256 * 4)
		return midx_error("OID Fanout chunk has wrong length");

	fanout = (const uint32_t *)(data + chunk_oid_fanout->offset);
	idx->oid_fanout = fanout;

	for (bucket = 0; bucket < 256; ++bucket) {
		uint32_t cumulative = ntohl(fanout[bucket]);

		if (cumulative < running_total)
			return midx_error("index is non-monotonic");
		running_total = cumulative;
	}

	idx->num_objects = running_total;
	return 0;
}
/*
 * Validate the OID Lookup chunk and publish it through idx->oid_lookup.
 *
 * The chunk must contain exactly idx->num_objects 20-byte object ids in
 * strictly increasing order. Because the first id is compared against the
 * all-zero id, an all-zero first entry is rejected as well.
 *
 * idx->num_objects must already have been set by midx_parse_oid_fanout().
 *
 * Returns 0 on success, -1 (with an error set) otherwise.
 */
static int midx_parse_oid_lookup(
		git_midx_file *idx,
		const unsigned char *data,
		struct git_midx_chunk *chunk_oid_lookup)
{
	uint32_t i;
	git_oid *oid, *prev_oid, zero_oid = {{0}};

	if (chunk_oid_lookup->offset == 0)
		return midx_error("missing OID Lookup chunk");
	if (chunk_oid_lookup->length == 0)
		return midx_error("empty OID Lookup chunk");

	/*
	 * Compare in 64 bits. num_objects comes straight from the file, and a
	 * 32-bit `num_objects * 20` can wrap around to a small value, which
	 * would let an undersized chunk pass this check and make the loop
	 * below read far beyond the end of the data.
	 */
	if ((uint64_t)chunk_oid_lookup->length != (uint64_t)idx->num_objects * 20)
		return midx_error("OID Lookup chunk has wrong length");

	idx->oid_lookup = oid = (git_oid *)(data + chunk_oid_lookup->offset);
	prev_oid = &zero_oid;
	for (i = 0; i < idx->num_objects; ++i, ++oid) {
		if (git_oid_cmp(prev_oid, oid) >= 0)
			return midx_error("OID Lookup index is non-monotonic");
		prev_oid = oid;
	}

	return 0;
}
/*
 * Validate the Object Offsets chunk and publish it through
 * idx->object_offsets.
 *
 * The chunk must contain exactly idx->num_objects entries of 8 bytes each.
 * idx->num_objects must already have been set by midx_parse_oid_fanout().
 *
 * Returns 0 on success, -1 (with an error set) otherwise.
 */
static int midx_parse_object_offsets(
		git_midx_file *idx,
		const unsigned char *data,
		struct git_midx_chunk *chunk_object_offsets)
{
	if (chunk_object_offsets->offset == 0)
		return midx_error("missing Object Offsets chunk");
	if (chunk_object_offsets->length == 0)
		return midx_error("empty Object Offsets chunk");

	/*
	 * Compare in 64 bits: a 32-bit `num_objects * 8` can wrap around and
	 * accept a chunk that is far too small for the advertised object
	 * count, leading to out-of-bounds reads during lookups.
	 */
	if ((uint64_t)chunk_object_offsets->length != (uint64_t)idx->num_objects * 8)
		return midx_error("Object Offsets chunk has wrong length");

	idx->object_offsets = data + chunk_object_offsets->offset;

	return 0;
}
/*
 * Record the optional Object Large Offsets chunk.
 *
 * A zero-length chunk is accepted and leaves idx untouched. Otherwise the
 * chunk length must be a multiple of 8 bytes; the chunk start and the number
 * of 8-byte entries are stored in idx.
 *
 * Returns 0 on success, -1 (with an error set) otherwise.
 */
static int midx_parse_object_large_offsets(
		git_midx_file *idx,
		const unsigned char *data,
		struct git_midx_chunk *chunk_object_large_offsets)
{
	const size_t entry_size = 8;
	const size_t chunk_length = chunk_object_large_offsets->length;

	/* The chunk is optional: nothing to record when it is absent. */
	if (chunk_length == 0)
		return 0;

	if (chunk_length % entry_size != 0)
		return midx_error("malformed Object Large Offsets chunk");

	idx->object_large_offsets = data + chunk_object_large_offsets->offset;
	idx->num_object_large_offsets = chunk_length / entry_size;

	return 0;
}
/*
 * Parse and validate an in-memory multi-pack-index, filling in `idx`.
 *
 * idx:  structure to populate; must not be NULL. Its table pointers end up
 *       pointing into `data`, so `data` must stay valid while `idx` is used.
 * data: the complete file contents.
 * size: number of bytes available at `data`.
 *
 * Validation order: header, trailer checksum, chunk table, then each chunk
 * (packfile names, OID fanout, OID lookup, object offsets, large offsets).
 *
 * Returns 0 on success or a negative value with an error set. `idx` may be
 * partially populated on failure, so callers should still release it with
 * git_midx_close() or git_midx_free().
 */
int git_midx_parse(
git_midx_file *idx,
const unsigned char *data,
size_t size)
{
struct git_midx_header *hdr;
const unsigned char *chunk_hdr;
struct git_midx_chunk *last_chunk;
uint32_t i;
off64_t last_chunk_offset, chunk_offset, trailer_offset;
git_oid idx_checksum = {{0}};
int error;
/* A chunk whose offset stays 0 was never seen in the chunk table. */
struct git_midx_chunk chunk_packfile_names = {0},
chunk_oid_fanout = {0},
chunk_oid_lookup = {0},
chunk_object_offsets = {0},
chunk_object_large_offsets = {0};
assert(idx);
/* The file must at least hold the header plus the 20-byte trailer checksum. */
if (size < sizeof(struct git_midx_header) + 20)
return midx_error("multi-pack index is too short");
hdr = ((struct git_midx_header *)data);
if (hdr->signature != htonl(MIDX_SIGNATURE) ||
hdr->version != MIDX_VERSION ||
hdr->object_id_version != MIDX_OBJECT_ID_VERSION) {
return midx_error("unsupported multi-pack index version");
}
if (hdr->chunks == 0)
return midx_error("no chunks in multi-pack index");
/*
* The very first chunk's offset should be after the header, all the chunk
* headers, and a special zero chunk.
*/
/* Each chunk table entry is 12 bytes: a 4-byte ID followed by an 8-byte offset. */
last_chunk_offset =
sizeof(struct git_midx_header) +
(1 + hdr->chunks) * 12;
trailer_offset = size - 20;
/* This also guarantees that the whole chunk table lies before the trailer. */
if (trailer_offset < last_chunk_offset)
return midx_error("wrong index size");
/* The trailer is the hash of everything that precedes it; verify it up front. */
git_oid_cpy(&idx->checksum, (git_oid *)(data + trailer_offset));
if (git_hash_buf(&idx_checksum, data, (size_t)trailer_offset) < 0)
return midx_error("could not calculate signature");
if (!git_oid_equal(&idx_checksum, &idx->checksum))
return midx_error("index signature mismatch");
chunk_hdr = data + sizeof(struct git_midx_header);
last_chunk = NULL;
for (i = 0; i < hdr->chunks; ++i, chunk_hdr += 12) {
/* The 64-bit offset is stored as two 32-bit halves in network byte order. */
chunk_offset = ((off64_t)ntohl(*((uint32_t *)(chunk_hdr + 4)))) << 32 |
((off64_t)ntohl(*((uint32_t *)(chunk_hdr + 8))));
/* Offsets may repeat but never go backwards, and must stay before the trailer. */
if (chunk_offset < last_chunk_offset)
return midx_error("chunks are non-monotonic");
if (chunk_offset >= trailer_offset)
return midx_error("chunks extend beyond the trailer");
/* A chunk ends where the next one begins, so its length is known only now. */
if (last_chunk != NULL)
last_chunk->length = (size_t)(chunk_offset - last_chunk_offset);
last_chunk_offset = chunk_offset;
/* Dispatch on the 4-byte chunk ID and remember which chunk is still open. */
switch (ntohl(*((uint32_t *)(chunk_hdr + 0)))) {
case MIDX_PACKFILE_NAMES_ID:
chunk_packfile_names.offset = last_chunk_offset;
last_chunk = &chunk_packfile_names;
break;
case MIDX_OID_FANOUT_ID:
chunk_oid_fanout.offset = last_chunk_offset;
last_chunk = &chunk_oid_fanout;
break;
case MIDX_OID_LOOKUP_ID:
chunk_oid_lookup.offset = last_chunk_offset;
last_chunk = &chunk_oid_lookup;
break;
case MIDX_OBJECT_OFFSETS_ID:
chunk_object_offsets.offset = last_chunk_offset;
last_chunk = &chunk_object_offsets;
break;
case MIDX_OBJECT_LARGE_OFFSETS_ID:
chunk_object_large_offsets.offset = last_chunk_offset;
last_chunk = &chunk_object_large_offsets;
break;
default:
return midx_error("unrecognized chunk ID");
}
}
/*
* The final chunk runs up to the trailer. last_chunk cannot be NULL here:
* there is at least one chunk and every iteration either sets it or returns.
*/
last_chunk->length = (size_t)(trailer_offset - last_chunk_offset);
/*
* Parse the individual chunks. The fanout must be parsed before the OID
* lookup and object offsets because it establishes idx->num_objects, which
* those two use to validate their lengths.
*/
error = midx_parse_packfile_names(
idx, data, ntohl(hdr->packfiles), &chunk_packfile_names);
if (error < 0)
return error;
error = midx_parse_oid_fanout(idx, data, &chunk_oid_fanout);
if (error < 0)
return error;
error = midx_parse_oid_lookup(idx, data, &chunk_oid_lookup);
if (error < 0)
return error;
error = midx_parse_object_offsets(idx, data, &chunk_object_offsets);
if (error < 0)
return error;
error = midx_parse_object_large_offsets(idx, data, &chunk_object_large_offsets);
if (error < 0)
return error;
return 0;
}
/*
 * Open, memory-map and parse the multi-pack-index file at `path`.
 *
 * idx_out: receives the newly allocated index on success; it is left
 *          untouched on failure. Release it with git_midx_free().
 * path:    location of the multi-pack-index file.
 *
 * The file descriptor is closed before returning on every path; the mapping
 * is what keeps the file contents available afterwards.
 *
 * Returns 0 on success or a negative value on error.
 */
int git_midx_open(
		git_midx_file **idx_out,
		const char *path)
{
	git_midx_file *idx;
	git_file fd = -1;
	size_t idx_size;
	struct stat st;
	int error;

	/* TODO: properly open the file without access time using O_NOATIME */
	fd = git_futils_open_ro(path);
	if (fd < 0)
		return fd;

	if (p_fstat(fd, &st) < 0) {
		p_close(fd);
		git_error_set(GIT_ERROR_ODB, "multi-pack-index file not found - '%s'", path);
		return -1;
	}

	/* Only regular files whose size fits in a size_t can be mapped. */
	if (!S_ISREG(st.st_mode) || !git__is_sizet(st.st_size)) {
		p_close(fd);
		git_error_set(GIT_ERROR_ODB, "invalid pack index '%s'", path);
		return -1;
	}
	idx_size = (size_t)st.st_size;

	idx = git__calloc(1, sizeof(git_midx_file));
	if (idx == NULL) {
		/*
		 * Close the descriptor before bailing out; returning straight
		 * from the allocation check would leak it.
		 */
		p_close(fd);
		return -1;
	}

	error = git_futils_mmap_ro(&idx->index_map, fd, 0, idx_size);
	p_close(fd);
	if (error < 0) {
		git_midx_free(idx);
		return error;
	}

	if ((error = git_midx_parse(idx, idx->index_map.data, idx_size)) < 0) {
		git_midx_free(idx);
		return error;
	}

	*idx_out = idx;
	return 0;
}
/*
 * Look up an object id, or an id prefix, in a parsed multi-pack-index.
 *
 * e:         filled in on success with the pack index, pack offset and the
 *            full object id of the match.
 * idx:       a successfully parsed index; must not be NULL.
 * short_oid: the id (or prefix) to search for.
 * len:       prefix length handed to git_oid_ncmp(); pass GIT_OID_HEXSZ to
 *            match a complete id.
 *
 * Returns 0 on success, a not-found error when nothing matches (or when the
 * entry refers to a large offset that does not exist), an ambiguity error
 * when a prefix matches more than one object, or -1 when the entry refers to
 * a packfile that is not in the names table.
 */
int git_midx_entry_find(
		git_midx_entry *e,
		git_midx_file *idx,
		const git_oid *short_oid,
		size_t len)
{
	int pos, found = 0;
	size_t pack_index;
	uint32_t hi, lo;
	const git_oid *current = NULL;
	const unsigned char *object_offset;
	off64_t offset;

	assert(idx);

	/* The fanout table narrows the search to ids sharing the first byte. */
	hi = ntohl(idx->oid_fanout[(int)short_oid->id[0]]);
	lo = ((short_oid->id[0] == 0x0) ? 0 : ntohl(idx->oid_fanout[(int)short_oid->id[0] - 1]));

	pos = git_pack__lookup_sha1(idx->oid_lookup, 20, lo, hi, short_oid->id);

	if (pos >= 0) {
		/* An object matching exactly the oid was found */
		found = 1;
		current = idx->oid_lookup + pos;
	} else {
		/* No object was found */
		/* pos refers to the object with the "closest" oid to short_oid */
		pos = -1 - pos;
		if (pos < (int)idx->num_objects) {
			current = idx->oid_lookup + pos;

			if (!git_oid_ncmp(short_oid, current, len))
				found = 1;
		}
	}

	if (found && len != GIT_OID_HEXSZ && pos + 1 < (int)idx->num_objects) {
		/* Check for ambiguity: the next id may share the same prefix. */
		const git_oid *next = current + 1;

		if (!git_oid_ncmp(short_oid, next, len))
			found = 2;
	}

	if (!found)
		return git_odb__error_notfound("failed to find offset for multi-pack index entry", short_oid, len);
	if (found > 1)
		return git_odb__error_ambiguous("found multiple offsets for multi-pack index entry");

	/*
	 * Each Object Offsets entry is 8 bytes: a 4-byte pack index followed by
	 * a 4-byte offset. The byte offset is computed in size_t so that it
	 * cannot overflow `int` on very large indexes.
	 */
	object_offset = idx->object_offsets + (size_t)pos * 8;
	offset = ntohl(*((uint32_t *)(object_offset + 4)));
	if (offset & 0x80000000) {
		/* High bit set: the low 31 bits index the Object Large Offsets table. */
		uint32_t object_large_offsets_pos = offset & 0x7fffffff;
		const unsigned char *object_large_offsets_index = idx->object_large_offsets;

		/* Make sure we're not being sent out of bounds */
		if (object_large_offsets_pos >= idx->num_object_large_offsets)
			return git_odb__error_notfound("invalid index into the object large offsets table", short_oid, len);

		object_large_offsets_index += 8 * object_large_offsets_pos;

		offset = (((uint64_t)ntohl(*((uint32_t *)(object_large_offsets_index + 0)))) << 32) |
				ntohl(*((uint32_t *)(object_large_offsets_index + 4)));
	}

	pack_index = ntohl(*((uint32_t *)(object_offset + 0)));
	if (pack_index >= git_vector_length(&idx->packfile_names))
		return midx_error("invalid index into the packfile names table");

	e->pack_index = pack_index;
	e->offset = offset;
	git_oid_cpy(&e->sha1, current);
	return 0;
}
/*
 * Release the resources held by `idx` without freeing `idx` itself: the file
 * mapping, if one exists, and the packfile names vector.
 *
 * idx must not be NULL.
 */
void git_midx_close(git_midx_file *idx)
{
	assert(idx);

	/* An index parsed from a caller-owned buffer has no mapping to undo. */
	if (idx->index_map.data != NULL)
		git_futils_mmap_free(&idx->index_map);

	git_vector_free(&idx->packfile_names);
}
/*
 * Close `idx` and free the structure itself. Passing NULL is a no-op.
 */
void git_midx_free(git_midx_file *idx)
{
	if (idx != NULL) {
		git_midx_close(idx);
		git__free(idx);
	}
}
/*
* Copyright (C) the libgit2 contributors. All rights reserved.
*
* This file is part of libgit2, distributed under the GNU GPL v2 with
* a Linking Exception. For full terms see the included COPYING file.
*/
#ifndef INCLUDE_midx_h__
#define INCLUDE_midx_h__
#include "common.h"
#include <ctype.h>
#include "map.h"
#include "mwindow.h"
/*
* A multi-pack-index file.
*
* This file contains a merged index for multiple independent .pack files. This
* can help speed up locating objects without requiring a garbage collection
* cycle to create a single .pack file.
*
* Support for this feature was added in git 2.21, and requires the
* `core.multiPackIndex` config option to be set.
*/
/*
 * In-memory view of a parsed multi-pack-index. Every table pointer below
 * refers to bytes inside the parsed data (the mapping for files opened with
 * git_midx_open(), or the caller's buffer for git_midx_parse()); nothing is
 * copied.
 */
typedef struct git_midx_file {
/* The file mapping created by git_midx_open(); stays zeroed when parsing a caller-supplied buffer. */
git_map index_map;
/* The table of Packfile Names. Entries are pointers to NUL-terminated names inside the parsed data. */
git_vector packfile_names;
/* The OID Fanout table: 256 non-decreasing counters stored in network byte order. */
const uint32_t *oid_fanout;
/* The total number of objects in the index (the last fanout counter). */
uint32_t num_objects;
/* The OID Lookup table: num_objects object ids in strictly increasing order. */
git_oid *oid_lookup;
/* The Object Offsets table. Each entry has two 4-byte fields with the pack index and the offset. */
const unsigned char *object_offsets;
/* The Object Large Offsets table. Remains NULL when the chunk is absent. */
const unsigned char *object_large_offsets;
/* The number of entries in the Object Large Offsets table. Each entry is an 8-byte offset. */
size_t num_object_large_offsets;
/* The trailer of the file: the SHA1 checksum of all the bytes that precede it. */
git_oid checksum;
} git_midx_file;
/*
* An entry in the multi-pack-index file. Similar in purpose to git_pack_entry.
*/
/* The result of a successful git_midx_entry_find() lookup. */
typedef struct git_midx_entry {
/* The index within idx->packfile_names where the packfile name can be found. */
size_t pack_index;
/*
* The offset within the .pack file where the requested object is found.
* Taken from the Object Large Offsets table when the 4-byte offset has its
* high bit set.
*/
off64_t offset;
/* The SHA-1 hash of the requested object (the full id, even for prefix lookups). */
git_oid sha1;
} git_midx_entry;
/*
 * Open and parse the multi-pack-index file at `path`. On success `*idx_out`
 * receives a newly allocated index that must be released with
 * git_midx_free(). Returns 0 on success or a negative value on error.
 */
int git_midx_open(
git_midx_file **idx_out,
const char *path);
/*
 * Look up `short_oid` (or its first `len` hex digits; pass GIT_OID_HEXSZ for
 * a complete id) in `idx` and fill in `e` with the match. Returns 0 on
 * success and a negative value when the object is not found, the prefix is
 * ambiguous, or the matching entry is malformed.
 */
int git_midx_entry_find(
git_midx_entry *e,
git_midx_file *idx,
const git_oid *short_oid,
size_t len);
/* Release the resources held by `idx` without freeing `idx` itself. `idx` must not be NULL. */
void git_midx_close(git_midx_file *idx);
/* Close `idx` and free it. Passing NULL is a no-op. */
void git_midx_free(git_midx_file *idx);
/*
 * Parse `size` bytes of multi-pack-index data at `data` into `idx`. The index
 * keeps pointers into `data`, which must therefore outlive it. Returns 0 on
 * success or a negative value on error.
 */
/* This is exposed for use in the fuzzers. */
int git_midx_parse(
git_midx_file *idx,
const unsigned char *data,
size_t size);
#endif
......@@ -1257,14 +1257,14 @@ int git_pack_foreach_entry(
return error;
}
static int sha1_position(const void *table, size_t stride, unsigned lo,
unsigned hi, const unsigned char *key)
int git_pack__lookup_sha1(const void *oid_lookup_table, size_t stride, unsigned lo,
unsigned hi, const unsigned char *oid_prefix)
{
const unsigned char *base = table;
const unsigned char *base = oid_lookup_table;
while (lo < hi) {
unsigned mi = (lo + hi) / 2;
int cmp = git_oid__hashcmp(base + mi * stride, key);
int cmp = git_oid__hashcmp(base + mi * stride, oid_prefix);
if (!cmp)
return mi;
......@@ -1326,7 +1326,7 @@ static int pack_entry_find_offset(
short_oid->id[0], short_oid->id[1], short_oid->id[2], lo, hi, p->num_objects);
#endif
pos = sha1_position(index, stride, lo, hi, short_oid->id);
pos = git_pack__lookup_sha1(index, stride, lo, hi, short_oid->id);
if (pos >= 0) {
/* An object matching exactly the oid was found */
......
......@@ -106,6 +106,19 @@ struct git_pack_file {
char pack_name[GIT_FLEX_ARRAY]; /* more */
};
/**
* Return the position where an OID (or a prefix) would be inserted within the
* OID Lookup Table of an .idx file. This performs binary search between the lo
* and hi indices.
*
* The stride parameter is provided because .idx files version 1 store the OIDs
* interleaved with the 4-byte file offsets of the objects within the .pack
* file (stride = 24), whereas files with version 2 store them in a contiguous
* flat array (stride = 20).
*
* @param oid_lookup_table start of the table to search
* @param stride distance in bytes between two consecutive entries
* @param lo first index of the range to search
* @param hi end of the range to search (the search loop runs while lo < hi)
* @param oid_prefix the raw OID bytes to look for
* @return the index of the entry on an exact match; otherwise a negative
*         value from which callers recover the closest position as
*         `-1 - result`
*/
int git_pack__lookup_sha1(const void *oid_lookup_table, size_t stride, unsigned lo,
unsigned hi, const unsigned char *oid_prefix);
struct git_pack_entry {
off64_t offset;
git_oid sha1;
......
#include "clar_libgit2.h"
#include <git2.h>
#include "midx.h"
/*
 * Opens the multi-pack-index shipped with the "testrepo.git" fixture, looks
 * up a known object id, and checks that the entry reports the same id and
 * resolves to the expected packfile name.
 */
void test_pack_midx__parse(void)
{
	git_buf midx_path = GIT_BUF_INIT;
	git_repository *repo;
	struct git_midx_file *idx;
	struct git_midx_entry e;
	git_oid id;

	/* Locate and load the index that lives next to the fixture's packs. */
	cl_git_pass(git_repository_open(&repo, cl_fixture("testrepo.git")));
	cl_git_pass(git_buf_joinpath(&midx_path, git_repository_path(repo), "objects/pack/multi-pack-index"));
	cl_git_pass(git_midx_open(&idx, git_buf_cstr(&midx_path)));

	/* A full-length lookup must hand back exactly the id that was asked for. */
	cl_git_pass(git_oid_fromstr(&id, "5001298e0c09ad9c34e4249bc5801c75e9754fa5"));
	cl_git_pass(git_midx_entry_find(&e, idx, &id, GIT_OID_HEXSZ));
	cl_assert_equal_oid(&e.sha1, &id);

	/* The entry's pack index must point at the pack that holds the object. */
	cl_assert_equal_s(
		(const char *)git_vector_get(&idx->packfile_names, e.pack_index),
		"pack-d7c6adf9f61318f041845b01440d09aa7a91e1b5.idx");

	git_midx_free(idx);
	git_buf_dispose(&midx_path);
	git_repository_free(repo);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment