Commit 0e040c03 by Carlos Martín Nieto

indexer: use a hashtable for keeping track of offsets

These offsets are needed for REF_DELTA objects, which encode which
object they use as a base but not where that base lies in the packfile,
so we need to keep a lookup from object id to offset.

These objects are mostly from older packfiles, from before OFS_DELTA
became widespread. The time spent indexing these packfiles is greatly
reduced, though it remains above what git itself achieves.
parent 29ab8774
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "posix.h" #include "posix.h"
#include "pack.h" #include "pack.h"
#include "filebuf.h" #include "filebuf.h"
#include "oidmap.h"
#define UINT31_MAX (0x7FFFFFFF) #define UINT31_MAX (0x7FFFFFFF)
...@@ -122,14 +123,6 @@ static int objects_cmp(const void *a, const void *b) ...@@ -122,14 +123,6 @@ static int objects_cmp(const void *a, const void *b)
return git_oid_cmp(&entrya->oid, &entryb->oid); return git_oid_cmp(&entrya->oid, &entryb->oid);
} }
static int cache_cmp(const void *a, const void *b)
{
const struct git_pack_entry *ea = a;
const struct git_pack_entry *eb = b;
return git_oid_cmp(&ea->sha1, &eb->sha1);
}
int git_indexer_stream_new( int git_indexer_stream_new(
git_indexer_stream **out, git_indexer_stream **out,
const char *prefix, const char *prefix,
...@@ -271,7 +264,8 @@ static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start, ...@@ -271,7 +264,8 @@ static int crc_object(uint32_t *crc_out, git_mwindow_file *mwf, git_off_t start,
static int store_object(git_indexer_stream *idx) static int store_object(git_indexer_stream *idx)
{ {
int i; int i, error;
khiter_t k;
git_oid oid; git_oid oid;
struct entry *entry; struct entry *entry;
git_off_t entry_size; git_off_t entry_size;
...@@ -296,11 +290,15 @@ static int store_object(git_indexer_stream *idx) ...@@ -296,11 +290,15 @@ static int store_object(git_indexer_stream *idx)
git_oid_cpy(&pentry->sha1, &oid); git_oid_cpy(&pentry->sha1, &oid);
pentry->offset = entry_start; pentry->offset = entry_start;
if (git_vector_insert(&idx->pack->cache, pentry) < 0) {
k = kh_put(oid, idx->pack->idx_cache, &pentry->sha1, &error);
if (!error) {
git__free(pentry); git__free(pentry);
goto on_error; goto on_error;
} }
kh_value(idx->pack->idx_cache, k) = pentry;
git_oid_cpy(&entry->oid, &oid); git_oid_cpy(&entry->oid, &oid);
if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0) if (crc_object(&entry->crc, &idx->pack->mwf, entry_start, entry_size) < 0)
...@@ -324,7 +322,8 @@ on_error: ...@@ -324,7 +322,8 @@ on_error:
static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t entry_start) static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t entry_start)
{ {
int i; int i, error;
khiter_t k;
git_oid oid; git_oid oid;
size_t entry_size; size_t entry_size;
struct entry *entry; struct entry *entry;
...@@ -351,11 +350,14 @@ static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t ent ...@@ -351,11 +350,14 @@ static int hash_and_save(git_indexer_stream *idx, git_rawobj *obj, git_off_t ent
git_oid_cpy(&pentry->sha1, &oid); git_oid_cpy(&pentry->sha1, &oid);
pentry->offset = entry_start; pentry->offset = entry_start;
if (git_vector_insert(&idx->pack->cache, pentry) < 0) { k = kh_put(oid, idx->pack->idx_cache, &pentry->sha1, &error);
if (!error) {
git__free(pentry); git__free(pentry);
goto on_error; goto on_error;
} }
kh_value(idx->pack->idx_cache, k) = pentry;
git_oid_cpy(&entry->oid, &oid); git_oid_cpy(&entry->oid, &oid);
entry->crc = crc32(0L, Z_NULL, 0); entry->crc = crc32(0L, Z_NULL, 0);
...@@ -426,8 +428,8 @@ int git_indexer_stream_add(git_indexer_stream *idx, const void *data, size_t siz ...@@ -426,8 +428,8 @@ int git_indexer_stream_add(git_indexer_stream *idx, const void *data, size_t siz
/* for now, limit to 2^32 objects */ /* for now, limit to 2^32 objects */
assert(idx->nr_objects == (size_t)((unsigned int)idx->nr_objects)); assert(idx->nr_objects == (size_t)((unsigned int)idx->nr_objects));
if (git_vector_init(&idx->pack->cache, (unsigned int)idx->nr_objects, cache_cmp) < 0) idx->pack->idx_cache = git_oidmap_alloc();
return -1; GITERR_CHECK_ALLOC(idx->pack->idx_cache);
idx->pack->has_cache = 1; idx->pack->has_cache = 1;
if (git_vector_init(&idx->objects, (unsigned int)idx->nr_objects, objects_cmp) < 0) if (git_vector_init(&idx->objects, (unsigned int)idx->nr_objects, objects_cmp) < 0)
...@@ -718,9 +720,9 @@ on_error: ...@@ -718,9 +720,9 @@ on_error:
void git_indexer_stream_free(git_indexer_stream *idx) void git_indexer_stream_free(git_indexer_stream *idx)
{ {
khiter_t k;
unsigned int i; unsigned int i;
struct entry *e; struct entry *e;
struct git_pack_entry *pe;
struct delta_info *delta; struct delta_info *delta;
if (idx == NULL) if (idx == NULL)
...@@ -729,11 +731,16 @@ void git_indexer_stream_free(git_indexer_stream *idx) ...@@ -729,11 +731,16 @@ void git_indexer_stream_free(git_indexer_stream *idx)
git_vector_foreach(&idx->objects, i, e) git_vector_foreach(&idx->objects, i, e)
git__free(e); git__free(e);
git_vector_free(&idx->objects); git_vector_free(&idx->objects);
if (idx->pack) { if (idx->pack) {
git_vector_foreach(&idx->pack->cache, i, pe) for (k = kh_begin(idx->pack->idx_cache); k != kh_end(idx->pack->idx_cache); k++) {
git__free(pe); if (kh_exist(idx->pack->idx_cache, k))
git_vector_free(&idx->pack->cache); git__free(kh_value(idx->pack->idx_cache, k));
}
git_oidmap_free(idx->pack->idx_cache);
} }
git_vector_foreach(&idx->deltas, i, delta) git_vector_foreach(&idx->deltas, i, delta)
git__free(delta); git__free(delta);
git_vector_free(&idx->deltas); git_vector_free(&idx->deltas);
......
...@@ -21,8 +21,6 @@ ...@@ -21,8 +21,6 @@
#include "git2/indexer.h" #include "git2/indexer.h"
#include "git2/config.h" #include "git2/config.h"
GIT__USE_OIDMAP;
struct unpacked { struct unpacked {
git_pobject *object; git_pobject *object;
void *data; void *data;
......
...@@ -760,13 +760,14 @@ git_off_t get_delta_base( ...@@ -760,13 +760,14 @@ git_off_t get_delta_base(
} else if (type == GIT_OBJ_REF_DELTA) { } else if (type == GIT_OBJ_REF_DELTA) {
/* If we have the cooperative cache, search in it first */ /* If we have the cooperative cache, search in it first */
if (p->has_cache) { if (p->has_cache) {
size_t pos; khiter_t k;
struct git_pack_entry key; git_oid oid;
git_oid_fromraw(&key.sha1, base_info); git_oid_fromraw(&oid, base_info);
if (!git_vector_bsearch(&pos, &p->cache, &key)) { k = kh_get(oid, p->idx_cache, &oid);
if (k != kh_end(p->idx_cache)) {
*curpos += 20; *curpos += 20;
return ((struct git_pack_entry *)git_vector_get(&p->cache, pos))->offset; return ((struct git_pack_entry *)kh_value(p->idx_cache, k))->offset;
} }
} }
/* The base entry _must_ be in the same pack */ /* The base entry _must_ be in the same pack */
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include "map.h" #include "map.h"
#include "mwindow.h" #include "mwindow.h"
#include "odb.h" #include "odb.h"
#include "oidmap.h"
#define GIT_PACK_FILE_MODE 0444 #define GIT_PACK_FILE_MODE 0444
...@@ -62,6 +63,7 @@ typedef struct git_pack_cache_entry { ...@@ -62,6 +63,7 @@ typedef struct git_pack_cache_entry {
#include "offmap.h" #include "offmap.h"
GIT__USE_OFFMAP; GIT__USE_OFFMAP;
GIT__USE_OIDMAP;
#define GIT_PACK_CACHE_MEMORY_LIMIT 16 * 1024 * 1024 #define GIT_PACK_CACHE_MEMORY_LIMIT 16 * 1024 * 1024
#define GIT_PACK_CACHE_SIZE_LIMIT 1024 * 1024 /* don't bother caching anything over 1MB */ #define GIT_PACK_CACHE_SIZE_LIMIT 1024 * 1024 /* don't bother caching anything over 1MB */
...@@ -86,7 +88,7 @@ struct git_pack_file { ...@@ -86,7 +88,7 @@ struct git_pack_file {
git_time_t mtime; git_time_t mtime;
unsigned pack_local:1, pack_keep:1, has_cache:1; unsigned pack_local:1, pack_keep:1, has_cache:1;
git_oid sha1; git_oid sha1;
git_vector cache; git_oidmap *idx_cache;
git_oid **oids; git_oid **oids;
git_pack_cache bases; /* delta base cache */ git_pack_cache bases; /* delta base cache */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment