Commit fc658755 by Vicent Marti

Rewrite git_hashtable internals

The old hash table with chained buckets has been replaced by a new one
using Cuckoo hashing, which offers guaranteed constant lookup times.
This should improve speeds on most use cases, since hash tables in
libgit2 are usually used as caches where the objects are stored once and
queried several times.

The Cuckoo hash implementation is based off the one in the Basekit
library [1] for the IO language, but rewritten to support an arbritrary
number of hashes. We currently use 3 to maximize the usage of the nodes pool.

[1]: https://github.com/stevedekorte/basekit/blob/master/source/CHash.c

Signed-off-by: Vicent Marti <tanoku@gmail.com>
parent 4378e8d4
......@@ -27,46 +27,113 @@
#include "repository.h"
#include "commit.h"
#define MAX_LOOPS 5
static const double max_load_factor = 0.65;
static void hashtable_resize(git_hashtable *table)
static int resize_to(git_hashtable *self, size_t new_size);
static int set_size(git_hashtable *self, size_t new_size);
static git_hashtable_node *node_with_hash(git_hashtable *self, const void *key, int hash_id);
static void node_swap_with(git_hashtable_node *self, git_hashtable_node *other);
static int node_insert(git_hashtable *self, git_hashtable_node *new_node);
static int insert_nodes(git_hashtable *self, git_hashtable_node *old_nodes, size_t old_size);
static int resize_to(git_hashtable *self, size_t new_size)
{
git_hashtable_node **new_nodes;
unsigned int new_size, i;
git_hashtable_node *old_nodes = self->nodes;
size_t old_size = self->size;
self->is_resizing = 1;
assert(table);
do {
self->size = new_size;
self->size_mask = new_size - 1;
self->key_count = 0;
self->nodes = git__calloc(1, sizeof(git_hashtable_node) * self->size);
new_size = (table->size_mask + 1) * 2;
if (self->nodes == NULL)
return GIT_ENOMEM;
new_nodes = git__malloc(new_size * sizeof(git_hashtable_node *));
if (new_nodes == NULL)
return;
if (insert_nodes(self, old_nodes, old_size) == 0)
self->is_resizing = 0;
else {
new_size *= 2;
free(self->nodes);
}
} while(self->is_resizing);
free(old_nodes);
return GIT_SUCCESS;
}
memset(new_nodes, 0x0, new_size * sizeof(git_hashtable_node *));
static int set_size(git_hashtable *self, size_t new_size)
{
self->nodes = realloc(self->nodes, new_size * sizeof(git_hashtable_node));
if (self->nodes == NULL)
return GIT_ENOMEM;
for (i = 0; i <= table->size_mask; ++i) {
git_hashtable_node *n;
unsigned int index;
if (new_size > self->size) {
memset(&self->nodes[self->size], 0x0, (new_size - self->size) * sizeof(git_hashtable_node));
}
while ((n = table->nodes[i]) != NULL) {
table->nodes[i] = n->next;
index = n->hash & (new_size - 1);
n->next = new_nodes[index];
new_nodes[index] = n;
self->size = new_size;
self->size_mask = new_size - 1;
return GIT_SUCCESS;
}
static git_hashtable_node *node_with_hash(git_hashtable *self, const void *key, int hash_id)
{
size_t pos = self->hash(key, hash_id) & self->size_mask;
return git_hashtable_node_at(self->nodes, pos);
}
static void node_swap_with(git_hashtable_node *self, git_hashtable_node *other)
{
git_hashtable_node tmp = *self;
*self = *other;
*other = tmp;
}
static int node_insert(git_hashtable *self, git_hashtable_node *new_node)
{
int iteration, hash_id;
for (iteration = 0; iteration < MAX_LOOPS; iteration++) {
for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) {
git_hashtable_node *node;
node = node_with_hash(self, new_node->key, hash_id);
node_swap_with(new_node, node);
if(new_node->key == 0x0){
self->key_count++;
return GIT_SUCCESS;
}
}
}
free(table->nodes);
table->nodes = new_nodes;
table->size_mask = (new_size - 1);
table->max_count = (unsigned int)(new_size * max_load_factor);
if (self->is_resizing)
return GIT_EBUSY;
resize_to(self, self->size * 2);
git_hashtable_insert(self, new_node->key, new_node->value);
return GIT_SUCCESS;
}
static int insert_nodes(git_hashtable *self, git_hashtable_node *old_nodes, size_t old_size)
{
size_t i;
for (i = 0; i < old_size; ++i) {
git_hashtable_node *node = git_hashtable_node_at(old_nodes, i);
if (node->key && git_hashtable_insert(self, node->key, node->value) < GIT_SUCCESS)
return GIT_ENOMEM;
}
return GIT_SUCCESS;
}
git_hashtable *git_hashtable_alloc(unsigned int min_size,
git_hashtable *git_hashtable_alloc(size_t min_size,
git_hash_ptr hash,
git_hash_keyeq_ptr key_eq)
{
unsigned int i;
git_hashtable *table;
assert(hash && key_eq);
......@@ -74,6 +141,11 @@ git_hashtable *git_hashtable_alloc(unsigned int min_size,
if ((table = git__malloc(sizeof(git_hashtable))) == NULL)
return NULL;
memset(table, 0x0, sizeof(git_hashtable));
if (min_size < 8)
min_size = 8;
/* round up size to closest power of 2 */
min_size--;
min_size |= min_size >> 1;
......@@ -82,167 +154,90 @@ git_hashtable *git_hashtable_alloc(unsigned int min_size,
min_size |= min_size >> 8;
min_size |= min_size >> 16;
table->size_mask = min_size;
table->count = 0;
table->max_count = (unsigned int)((min_size + 1) * max_load_factor);
table->hash = hash;
table->key_equal = key_eq;
table->nodes = git__malloc((min_size + 1) * sizeof(git_hashtable_node *));
if (table->nodes == NULL) {
free(table);
return NULL;
}
for (i = 0; i <= min_size; ++i)
table->nodes[i] = NULL;
set_size(table, min_size + 1);
return table;
}
void git_hashtable_clear(git_hashtable *table)
void git_hashtable_clear(git_hashtable *self)
{
unsigned int index;
assert(table);
for (index = 0; index <= table->size_mask; ++index) {
git_hashtable_node *node, *next_node;
node = table->nodes[index];
while (node != NULL) {
next_node = node->next;
free(node);
node = next_node;
}
assert(self);
table->nodes[index] = NULL;
}
table->count = 0;
memset(self->nodes, 0x0, sizeof(git_hashtable_node) * self->size);
self->key_count = 0;
}
void git_hashtable_free(git_hashtable *table)
void git_hashtable_free(git_hashtable *self)
{
assert(table);
assert(self);
git_hashtable_clear(table);
free(table->nodes);
free(table);
free(self->nodes);
free(self);
}
int git_hashtable_insert(git_hashtable *table, const void *key, void *value)
int git_hashtable_insert(git_hashtable *self, const void *key, void *value)
{
int hash_id;
git_hashtable_node *node;
uint32_t index, hash;
assert(table);
if (table->count + 1 > table->max_count)
hashtable_resize(table);
node = git__malloc(sizeof(git_hashtable_node));
if (node == NULL)
return GIT_ENOMEM;
hash = table->hash(key);
index = (hash & table->size_mask);
for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) {
node = node_with_hash(self, key, hash_id);
node->object = value;
node->hash = hash;
node->next = table->nodes[index];
if (!node->key) {
node->key = key;
node->value = value;
self->key_count++;
return GIT_SUCCESS;
}
table->nodes[index] = node;
table->count++;
if (key == node->key || self->key_equal(key, node->key) == 0) {
node->value = value;
return GIT_SUCCESS;
}
}
return GIT_SUCCESS;
/* no space in table; must do cuckoo dance */
{
git_hashtable_node x;
x.key = key;
x.value = value;
return node_insert(self, &x);
}
}
void *git_hashtable_lookup(git_hashtable *table, const void *key)
void *git_hashtable_lookup(git_hashtable *self, const void *key)
{
int hash_id;
git_hashtable_node *node;
uint32_t index, hash;
assert(table);
hash = table->hash(key);
index = (hash & table->size_mask);
node = table->nodes[index];
while (node != NULL) {
if (node->hash == hash && table->key_equal(node->object, key))
return node->object;
node = node->next;
for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) {
node = node_with_hash(self, key, hash_id);
if (node->key && self->key_equal(key, node->key) == 0)
return node->value;
}
return NULL;
}
int git_hashtable_remove(git_hashtable *table, const void *key)
int git_hashtable_remove(git_hashtable *self, const void *key)
{
git_hashtable_node *node, *prev_node;
uint32_t index, hash;
assert(table);
hash = table->hash(key);
index = (hash & table->size_mask);
node = table->nodes[index];
prev_node = NULL;
while (node != NULL) {
if (node->hash == hash && table->key_equal(node->object, key)) {
if (prev_node == NULL)
table->nodes[index] = node->next;
else
prev_node->next = node->next;
int hash_id;
git_hashtable_node *node;
free(node);
for (hash_id = 0; hash_id < GIT_HASHTABLE_HASHES; ++hash_id) {
node = node_with_hash(self, key, hash_id);
if (node->key && self->key_equal(key, node->key) == 0) {
node->key = NULL;
node->value = NULL;
self->key_count--;
return GIT_SUCCESS;
}
prev_node = node;
node = node->next;
}
return GIT_ENOTFOUND;
}
void git_hashtable_iterator_init(git_hashtable *table, git_hashtable_iterator *it)
{
assert(table && it);
memset(it, 0x0, sizeof(git_hashtable_iterator));
it->nodes = table->nodes;
it->current_node = NULL;
it->current_pos = 0;
it->size = table->size_mask + 1;
}
void *git_hashtable_iterator_next(git_hashtable_iterator *it)
{
git_hashtable_node *next = NULL;
assert(it);
while (it->current_node == NULL) {
if (it->current_pos >= it->size)
return NULL;
it->current_node = it->nodes[it->current_pos++];
}
next = it->current_node;
it->current_node = it->current_node->next;
return next->object;
}
......@@ -5,39 +5,33 @@
#include "git2/oid.h"
#include "git2/odb.h"
#define GIT_HASHTABLE_HASHES 3
typedef uint32_t (*git_hash_ptr)(const void *);
typedef int (*git_hash_keyeq_ptr)(void *obj, const void *obj_key);
typedef uint32_t (*git_hash_ptr)(const void *, int hash_id);
typedef int (*git_hash_keyeq_ptr)(const void *key_a, const void *key_b);
struct git_hashtable_node {
void *object;
uint32_t hash;
struct git_hashtable_node *next;
const void *key;
void *value;
};
struct git_hashtable {
struct git_hashtable_node **nodes;
struct git_hashtable_node *nodes;
unsigned int size_mask;
unsigned int count;
unsigned int max_count;
size_t size_mask;
size_t size;
size_t key_count;
int is_resizing;
git_hash_ptr hash;
git_hash_keyeq_ptr key_equal;
};
struct git_hashtable_iterator {
struct git_hashtable_node **nodes;
struct git_hashtable_node *current_node;
unsigned int current_pos;
unsigned int size;
};
typedef struct git_hashtable_node git_hashtable_node;
typedef struct git_hashtable git_hashtable;
typedef struct git_hashtable_iterator git_hashtable_iterator;
git_hashtable *git_hashtable_alloc(unsigned int min_size,
git_hashtable *git_hashtable_alloc(size_t min_size,
git_hash_ptr hash,
git_hash_keyeq_ptr key_eq);
int git_hashtable_insert(git_hashtable *h, const void *key, void *value);
......@@ -46,7 +40,22 @@ int git_hashtable_remove(git_hashtable *table, const void *key);
void git_hashtable_free(git_hashtable *h);
void git_hashtable_clear(git_hashtable *h);
void *git_hashtable_iterator_next(git_hashtable_iterator *it);
void git_hashtable_iterator_init(git_hashtable *h, git_hashtable_iterator *it);
#define git_hashtable_node_at(nodes, pos) ((git_hashtable_node *)(&nodes[pos]))
#define GIT_HASHTABLE_FOREACH(self, pkey, pvalue, code) {\
git_hashtable *_self = (self);\
git_hashtable_node *_nodes = _self->nodes;\
unsigned int _i, _size = _self->size;\
for (_i = 0; _i < _size; _i ++) {\
git_hashtable_node *_node = git_hashtable_node_at(_nodes, _i);\
if (_node->key)\
{\
pkey = _node->key;\
pvalue = _node->value;\
code;\
}\
}\
}
#endif
......@@ -28,28 +28,21 @@
#include "repository.h"
#include "fileops.h"
#define HASH_SEED 2147483647
#define MAX_NESTING_LEVEL 5
static const int default_table_size = 32;
static uint32_t reftable_hash(const void *key)
static uint32_t reftable_hash(const void *key, int hash_id)
{
return git__hash(key, strlen((const char *)key), HASH_SEED);
}
static int reftable_haskey(void *reference, const void *key)
{
git_reference *ref;
char *name;
static uint32_t hash_seeds[GIT_HASHTABLE_HASHES] = {
2147483647,
0x5d20bb23,
0x7daaab3c
};
ref = (git_reference *)reference;
name = (char *)key;
return strcmp(name, ref->name) == 0;
return git__hash(key, strlen((const char *)key), hash_seeds[hash_id]);
}
static int check_refname(const char *name)
{
/*
......@@ -641,24 +634,21 @@ int git_repository__refcache_init(git_refcache *refs)
refs->cache = git_hashtable_alloc(
default_table_size,
reftable_hash,
reftable_haskey);
(git_hash_keyeq_ptr)strcmp);
return refs->cache ? GIT_SUCCESS : GIT_ENOMEM;
}
void git_repository__refcache_free(git_refcache *refs)
{
git_hashtable_iterator it;
const char *ref_name;
git_reference *reference;
assert(refs);
git_hashtable_iterator_init(refs->cache, &it);
while ((reference = (git_reference *)git_hashtable_iterator_next(&it)) != NULL) {
git_hashtable_remove(refs->cache, reference->name);
reference_free(reference);
}
GIT_HASHTABLE_FOREACH(refs->cache, ref_name, reference,
reference_free(reference)
);
git_hashtable_free(refs->cache);
}
......
......@@ -58,28 +58,16 @@ typedef struct {
* Callbacks for the ODB cache, implemented
* as a hash table
*/
uint32_t object_table_hash(const void *key)
uint32_t object_table_hash(const void *key, int hash_id)
{
uint32_t r;
git_oid *id;
id = (git_oid *)key;
memcpy(&r, id->id, sizeof(r));
memcpy(&r, id->id + (hash_id * sizeof(uint32_t)), sizeof(r));
return r;
}
int object_table_hashkey(void *object, const void *key)
{
git_object *obj;
git_oid *oid;
obj = (git_object *)object;
oid = (git_oid *)key;
return (git_oid_cmp(oid, &obj->id) == 0);
}
/*
* Git repository open methods
*
......@@ -245,9 +233,9 @@ static git_repository *repository_alloc()
repo->objects = git_hashtable_alloc(
OBJECT_TABLE_SIZE,
object_table_hash,
object_table_hashkey);
(git_hash_keyeq_ptr)git_oid_cmp);
if (repo->objects == NULL) {
if (repo->objects == NULL) {
free(repo);
return NULL;
}
......@@ -369,8 +357,8 @@ cleanup:
void git_repository_free(git_repository *repo)
{
git_hashtable_iterator it;
git_object *object;
const git_oid *oid;
if (repo == NULL)
return;
......@@ -380,11 +368,9 @@ void git_repository_free(git_repository *repo)
free(repo->path_repository);
free(repo->path_odb);
git_hashtable_iterator_init(repo->objects, &it);
while ((object = (git_object *)
git_hashtable_iterator_next(&it)) != NULL)
GIT_HASHTABLE_FOREACH(repo->objects, oid, object, {
git_object_free(object);
});
git_hashtable_free(repo->objects);
......
......@@ -28,28 +28,23 @@
#include "revwalk.h"
#include "hashtable.h"
uint32_t git_revwalk__commit_hash(const void *key)
uint32_t git_revwalk__commit_hash(const void *key, int hash_id)
{
uint32_t r;
git_commit *commit;
commit = (git_commit *)key;
memcpy(&r, commit->object.id.id, sizeof(r));
memcpy(&r, commit->object.id.id + (hash_id * sizeof(uint32_t)), sizeof(r));
return r;
}
int git_revwalk__commit_haskey(void *object, const void *key)
int git_revwalk__commit_keycmp(const void *key_a, const void *key_b)
{
git_revwalk_commit *walk_commit;
git_commit *commit_object;
walk_commit = (git_revwalk_commit *)object;
commit_object = (git_commit *)key;
return (walk_commit->commit_object == commit_object);
git_commit *a = (git_commit *)key_a;
git_commit *b = (git_commit *)key_b;
return git_oid_cmp(&a->object.id, &b->object.id);
}
int git_revwalk_new(git_revwalk **revwalk_out, git_repository *repo)
{
git_revwalk *walk;
......@@ -62,7 +57,7 @@ int git_revwalk_new(git_revwalk **revwalk_out, git_repository *repo)
walk->commits = git_hashtable_alloc(64,
git_revwalk__commit_hash,
git_revwalk__commit_haskey);
git_revwalk__commit_keycmp);
if (walk->commits == NULL) {
free(walk);
......@@ -245,18 +240,15 @@ int git_revwalk_next(git_commit **commit, git_revwalk *walk)
void git_revwalk_reset(git_revwalk *walk)
{
git_hashtable_iterator it;
const void *_unused;
git_revwalk_commit *commit;
assert(walk);
git_hashtable_iterator_init(walk->commits, &it);
while ((commit = (git_revwalk_commit *)
git_hashtable_iterator_next(&it)) != NULL) {
GIT_HASHTABLE_FOREACH(walk->commits, _unused, commit, {
git_revwalk_list_clear(&commit->parents);
free(commit);
}
});
git_hashtable_clear(walk->commits);
git_revwalk_list_clear(&walk->iterator);
......
......@@ -34,32 +34,26 @@ typedef struct _aux_object {
int visited;
} table_item;
uint32_t hash_func(const void *key)
uint32_t hash_func(const void *key, int hash_id)
{
uint32_t r;
git_oid *id;
id = (git_oid *)key;
memcpy(&r, id->id, sizeof(r));
memcpy(&r, id->id + (hash_id * sizeof(uint32_t)), sizeof(r));
return r;
}
int hash_haskey(void *item, const void *key)
int hash_cmpkey(const void *a, const void *b)
{
table_item *obj;
git_oid *oid;
obj = (table_item *)item;
oid = (git_oid *)key;
return (git_oid_cmp(oid, &obj->id) == 0);
return git_oid_cmp(a, b);
}
BEGIN_TEST("table", table_create)
git_hashtable *table = NULL;
table = git_hashtable_alloc(55, hash_func, hash_haskey);
table = git_hashtable_alloc(55, hash_func, hash_cmpkey);
must_be_true(table != NULL);
must_be_true(table->size_mask + 1 == 64);
......@@ -75,7 +69,7 @@ BEGIN_TEST("table", table_populate)
table_item *objects;
git_hashtable *table = NULL;
table = git_hashtable_alloc(objects_n * 2, hash_func, hash_haskey);
table = git_hashtable_alloc(objects_n * 2, hash_func, hash_cmpkey);
must_be_true(table != NULL);
objects = git__malloc(objects_n * sizeof(table_item));
......@@ -123,7 +117,7 @@ BEGIN_TEST("table", table_resize)
table_item *objects;
git_hashtable *table = NULL;
table = git_hashtable_alloc(objects_n, hash_func, hash_haskey);
table = git_hashtable_alloc(objects_n, hash_func, hash_cmpkey);
must_be_true(table != NULL);
objects = git__malloc(objects_n * sizeof(table_item));
......@@ -161,11 +155,11 @@ BEGIN_TEST("tableit", table_iterator)
const int objects_n = 32;
int i;
table_item *objects, *ob;
const void *_unused;
git_hashtable *table = NULL;
git_hashtable_iterator iterator;
table = git_hashtable_alloc(objects_n * 2, hash_func, hash_haskey);
table = git_hashtable_alloc(objects_n * 2, hash_func, hash_cmpkey);
must_be_true(table != NULL);
objects = git__malloc(objects_n * sizeof(table_item));
......@@ -177,11 +171,9 @@ BEGIN_TEST("tableit", table_iterator)
must_pass(git_hashtable_insert(table, &(objects[i].id), &(objects[i])));
}
git_hashtable_iterator_init(table, &iterator);
/* iterate through all nodes, mark as visited */
while ((ob = (table_item *)git_hashtable_iterator_next(&iterator)) != NULL)
GIT_HASHTABLE_FOREACH(table, _unused, ob,
ob->visited = 1;
);
/* make sure all nodes have been visited */
for (i = 0; i < objects_n; ++i)
......@@ -189,7 +181,6 @@ BEGIN_TEST("tableit", table_iterator)
git_hashtable_free(table);
free(objects);
END_TEST
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment