Commit a7c60cfc by Shawn O. Pearce

Add basic support to read pack-*.idx v1 and v2 files

The index data is mapped into memory and then scanned using a
binary search algorithm to locate the matching entry for the
supplied git_oid.  The standard fanout hash trick is applied to
reduce the search space by 8 iterations.

Since the v1 and v2 file formats differ in their search function,
due to the different layouts used for the object records, we use
two different search implementations and a virtual function pointer
to jump to the correct version of code for the current pack index.
The single function jump per-pack should be faster then computing
a branch point inside the inner loop of a common binary search.

To improve concurrency during read operations the pack lock is only
held while verifying the index is actually open, or while opening
the index for the first time.  This permits multiple concurrent
readers to scan through the same index.

If an invalid index file is opened we close it and mark the
git_pack's invalid bit to true.  The git_pack structure is kept
around in its parent git_packlist, but the invalid bit will cause
all future readers to skip over the pack entirely.  Pruning the
invalid entries is relatively unimportant because they shouldn't
be very common, a $GIT_DIRECTORY/objects/pack directory tends to
only have valid pack files.

Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
parent 20e7f426
...@@ -28,15 +28,44 @@ ...@@ -28,15 +28,44 @@
#include "git/zlib.h" #include "git/zlib.h"
#include "fileops.h" #include "fileops.h"
#include "hash.h" #include "hash.h"
#include <arpa/inet.h>
#include <stdio.h> #include <stdio.h>
#include "odb.h"
#define GIT_PACK_NAME_MAX (5 + 40 + 1) #define GIT_PACK_NAME_MAX (5 + 40 + 1)
typedef struct { struct git_pack {
git_odb *db;
git_lck lock; git_lck lock;
/** Functions to access idx_map. */
int (*idx_search)(
off_t *,
struct git_pack *,
const git_oid *);
/** The .idx file, mapped into memory. */
git_file idx_fd;
gitfo_map idx_map;
uint32_t *im_crc;
uint32_t *im_offset32;
uint32_t *im_offset64;
/** Number of objects in this pack. */
uint32_t obj_cnt;
/** Number of git_packlist we appear in. */
unsigned int refcnt; unsigned int refcnt;
/** Number of active users of the idx_map data. */
unsigned int idxcnt;
unsigned
invalid:1 /* the pack is unable to be read by libgit2 */
;
/** Name of the pack file(s), without extension ("pack-abc"). */
char pack_name[GIT_PACK_NAME_MAX]; char pack_name[GIT_PACK_NAME_MAX];
} git_pack; };
typedef struct git_pack git_pack;
typedef struct { typedef struct {
size_t n_packs; size_t n_packs;
...@@ -77,6 +106,17 @@ static struct { ...@@ -77,6 +106,17 @@ static struct {
{ "REF_DELTA", 0 } /* 7 = GIT_OBJ_REF_DELTA */ { "REF_DELTA", 0 } /* 7 = GIT_OBJ_REF_DELTA */
}; };
GIT_INLINE(uint32_t) decode32(void *b)
{
return ntohl(*((uint32_t*)b));
}
GIT_INLINE(uint64_t) decode64(void *b)
{
uint32_t *p = b;
return (((uint64_t)ntohl(p[0])) << 32) | ntohl(p[1]);
}
const char *git_obj_type_to_string(git_otype type) const char *git_obj_type_to_string(git_otype type)
{ {
if (type < 0 || type >= ARRAY_SIZE(obj_type_table)) if (type < 0 || type >= ARRAY_SIZE(obj_type_table))
...@@ -455,6 +495,160 @@ static int open_alternates(git_odb *db) ...@@ -455,6 +495,160 @@ static int open_alternates(git_odb *db)
return 0; return 0;
} }
static int pack_openidx_map(git_pack *p)
{
char pb[GIT_PATH_MAX];
off_t len;
if (git__fmt(pb, sizeof(pb), "%s/pack/%s.idx",
p->db->objects_dir,
p->pack_name) < 0)
return GIT_ERROR;
if ((p->idx_fd = gitfo_open(pb, O_RDONLY)) < 0)
return GIT_ERROR;
if ((len = gitfo_size(p->idx_fd)) < 0
|| !git__is_sizet(len)
|| gitfo_map_ro(&p->idx_map, p->idx_fd, 0, (size_t)len)) {
gitfo_close(p->idx_fd);
return GIT_ERROR;
}
return GIT_SUCCESS;
}
static int idxv1_search(off_t *out, git_pack *p, const git_oid *id)
{
unsigned char *data = p->idx_map.data;
size_t lo = id->id[0] ? decode32(data + ((id->id[0]-1) << 2)) : 0;
size_t hi = decode32(data + (id->id[0] << 2));
data += 1024;
do {
size_t mid = (lo + hi) >> 1;
size_t pos = 24 * mid;
int cmp = memcmp(id->id, data + pos + 4, 20);
if (cmp < 0)
hi = mid;
else if (!cmp) {
*out = decode32(data + pos);
return GIT_SUCCESS;
} else
lo = mid + 1;
} while (lo < hi);
return GIT_ENOTFOUND;
}
static int pack_openidx_v1(git_pack *p)
{
uint32_t *fanout = p->idx_map.data;
size_t expsz;
int j;
for (j = 1; j < 256; j++) {
if (decode32(&fanout[j]) < decode32(&fanout[j - 1]))
return GIT_ERROR;
}
p->obj_cnt = decode32(&fanout[255]);
expsz = 4 * 256 + 24 * p->obj_cnt + 2 * 20;
if (expsz != p->idx_map.len)
return GIT_ERROR;
p->idx_search = idxv1_search;
return GIT_SUCCESS;
}
static int idxv2_search(off_t *out, git_pack *p, const git_oid *id)
{
unsigned char *data = ((unsigned char*)p->idx_map.data) + 8;
size_t lo = id->id[0] ? decode32(data + ((id->id[0]-1) << 2)) : 0;
size_t hi = decode32(data + (id->id[0] << 2));
data += 1024;
do {
size_t mid = (lo + hi) >> 1;
size_t pos = 20 * mid;
int cmp = memcmp(id->id, data + pos, 20);
if (cmp < 0)
hi = mid;
else if (!cmp) {
uint32_t o32 = decode32(p->im_offset32 + mid);
if (o32 & 0x80000000)
*out = decode64(p->im_offset64 + 2*(o32 & ~0x80000000));
else
*out = o32;
return GIT_SUCCESS;
} else
lo = mid + 1;
} while (lo < hi);
return GIT_ENOTFOUND;
}
static int pack_openidx_v2(git_pack *p)
{
void *data = ((unsigned char*)p->idx_map.data) + 8;
uint32_t *fanout = data;
int j;
for (j = 1; j < 256; j++) {
if (decode32(&fanout[j]) < decode32(&fanout[j - 1]))
return GIT_ERROR;
}
p->obj_cnt = decode32(&fanout[255]);
p->idx_search = idxv2_search;
p->im_crc = (uint32_t*)(data + 1024 + (20 * p->obj_cnt));
p->im_offset32 = p->im_crc + p->obj_cnt;
p->im_offset64 = p->im_offset32 + p->obj_cnt;
return GIT_SUCCESS;
}
static int pack_openidx(git_pack *p)
{
gitlck_lock(&p->lock);
if (p->invalid)
goto unlock_fail;
if (++p->idxcnt == 1 && !p->idx_search) {
uint32_t *data;
if (pack_openidx_map(p))
goto invalid_fail;
data = p->idx_map.data;
if (decode32(&data[0]) == PACK_TOC) {
switch (decode32(&data[1])) {
case 2:
if (pack_openidx_v2(p))
goto unmap_fail;
break;
default:
goto unmap_fail;
}
} else if (pack_openidx_v1(p))
goto unmap_fail;
}
gitlck_unlock(&p->lock);
return GIT_SUCCESS;
unmap_fail:
gitfo_free_map(&p->idx_map);
invalid_fail:
p->invalid = 1;
p->idxcnt--;
unlock_fail:
gitlck_unlock(&p->lock);
return GIT_ERROR;
}
static void pack_decidx(git_pack *p)
{
gitlck_lock(&p->lock);
p->idxcnt--;
gitlck_unlock(&p->lock);
}
static void pack_dec(git_pack *p) static void pack_dec(git_pack *p)
{ {
int need_free; int need_free;
...@@ -464,6 +658,11 @@ static void pack_dec(git_pack *p) ...@@ -464,6 +658,11 @@ static void pack_dec(git_pack *p)
gitlck_unlock(&p->lock); gitlck_unlock(&p->lock);
if (need_free) { if (need_free) {
if (p->idx_search) {
gitfo_free_map(&p->idx_map);
gitfo_close(p->idx_fd);
}
gitlck_free(&p->lock); gitlck_free(&p->lock);
free(p); free(p);
} }
...@@ -552,6 +751,7 @@ static git_packlist* scan_packs(git_odb *db) ...@@ -552,6 +751,7 @@ static git_packlist* scan_packs(git_odb *db)
for (cnt = 0, c = state; c; ) { for (cnt = 0, c = state; c; ) {
struct scanned_pack *n = c->next; struct scanned_pack *n = c->next;
c->pack->db = db;
new_list->packs[cnt++] = c->pack; new_list->packs[cnt++] = c->pack;
free(c); free(c);
c = n; c = n;
...@@ -676,7 +876,20 @@ int git_odb__read_loose(git_obj *out, git_odb *db, const git_oid *id) ...@@ -676,7 +876,20 @@ int git_odb__read_loose(git_obj *out, git_odb *db, const git_oid *id)
static int read_packed(git_obj *out, git_pack *p, const git_oid *id) static int read_packed(git_obj *out, git_pack *p, const git_oid *id)
{ {
return GIT_ENOTFOUND; off_t pos;
int res;
if (pack_openidx(p))
return GIT_ERROR;
res = p->idx_search(&pos, p, id);
pack_decidx(p);
if (!res) {
/* TODO unpack object at pos */
res = GIT_ERROR;
}
return res;
} }
int git_odb__read_packed(git_obj *out, git_odb *db, const git_oid *id) int git_odb__read_packed(git_obj *out, git_odb *db, const git_oid *id)
......
#ifndef INCLUDE_odb_h__
#define INCLUDE_odb_h__
/** First 4 bytes of a pack-*.idx file header.
*
* Note this header exists only in idx v2 and later. The idx v1
* file format does not have a magic sequence at the front, and
* must be detected by the first four bytes *not* being this value
* and the first 8 bytes matching the following expression:
*
* uint32_t *fanout = ... the file data at offset 0 ...
* ntohl(fanout[0]) < ntohl(fanout[1])
*
* The value chosen here for PACK_TOC is such that the above
* cannot be true for an idx v1 file.
*/
#define PACK_TOC 0xff744f63 /* -1tOc */
#endif
...@@ -31,6 +31,13 @@ extern int git__fmt(char *, size_t, const char *, ...) ...@@ -31,6 +31,13 @@ extern int git__fmt(char *, size_t, const char *, ...)
extern int git__prefixcmp(const char *str, const char *prefix); extern int git__prefixcmp(const char *str, const char *prefix);
extern int git__suffixcmp(const char *str, const char *suffix); extern int git__suffixcmp(const char *str, const char *suffix);
/** @return true if p fits into the range of a size_t */
GIT_INLINE(int) git__is_sizet(off_t p)
{
size_t r = (size_t)p;
return p == r;
}
/* /*
* Realloc the buffer pointed at by variable 'x' so that it can hold * Realloc the buffer pointed at by variable 'x' so that it can hold
* at least 'nr' entries; the number of entries currently allocated * at least 'nr' entries; the number of entries currently allocated
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment