Commit 71db842f by Vicent Marti

Rewrite the Revision Walker

The new revision walker uses an internal Commit object storage system,
a custom memory allocator, and much-improved topological and time sorting
algorithms. It is about 20 times faster than the previous implementation
when browsing big repositories.

The following external API calls have changed:

	`git_revwalk_next` returns an OID instead of a full commit object.
	The initial call to `git_revwalk_next` is no longer blocking when
	iterating through a repo with a time-sorting mode.

	Iterating with Topological or inverted modes still makes the initial
	call blocking to preprocess the commit list, but this block should be
	mostly unnoticeable on most repositories (topological preprocessing
	times at 0.3s on the git.git repo).

	`git_revwalk_push` and `git_revwalk_hide` now take an OID instead
	of a full commit object.
parent 26022f07
......@@ -27,6 +27,7 @@
#include "common.h"
#include "types.h"
#include "object.h"
/**
* @file git2/revwalk.h
......@@ -88,14 +89,15 @@ GIT_EXTERN(void) git_revwalk_reset(git_revwalk *walker);
* @param walk the walker being used for the traversal.
* @param oid the OID of the commit to start from.
*/
GIT_EXTERN(int) git_revwalk_push(git_revwalk *walk, git_commit *commit);
GIT_EXTERN(int) git_revwalk_push(git_revwalk *walk, const git_oid *oid);
/**
* Mark a commit (and its ancestors) uninteresting for the output.
* @param walk the walker being used for the traversal.
* @param oid the OID of the commit that will be ignored during the traversal
*/
GIT_EXTERN(int) git_revwalk_hide(git_revwalk *walk, git_commit *commit);
GIT_EXTERN(int) git_revwalk_hide(git_revwalk *walk, const git_oid *oid);
/**
* Get the next commit from the revision traversal.
......@@ -105,7 +107,7 @@ GIT_EXTERN(int) git_revwalk_hide(git_revwalk *walk, git_commit *commit);
* @return GIT_SUCCESS if the next commit was found;
* GIT_EREVWALKOVER if there are no commits left to iterate
*/
GIT_EXTERN(int) git_revwalk_next(git_commit **commit, git_revwalk *walk);
GIT_EXTERN(int) git_revwalk_next(git_oid *oid, git_revwalk *walk);
/**
* Change the sorting mode when iterating through the
......
/*
* BORING COPYRIGHT NOTICE:
*
* This file is a heavily modified version of the priority queue found
* in the Apache project and the libpqueue library.
*
* https://github.com/vy/libpqueue
*
* These are the original authors:
*
* Copyright 2010 Volkan Yazıcı <volkan.yazici@gmail.com>
* Copyright 2006-2010 The Apache Software Foundation
*
* This file is licensed under the Apache 2.0 license, which
* supposedly makes it compatible with the GPLv2 that libgit2 uses.
*
* Check the Apache license at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* So much licensing trouble for a binary heap. Oh well.
*/
#include "common.h"
#include "pqueue.h"
/*
 * Index arithmetic for a binary heap kept in a 1-based array:
 * the children of slot i live in slots 2i and 2i + 1, and the
 * parent of slot i lives in slot i / 2.
 */
#define left(i)   ((i) * 2)
#define right(i)  (((i) * 2) + 1)
#define parent(i) ((i) / 2)
/*
 * Set up `q` as an empty queue with room for `n` items.
 *
 * Slot 0 of the backing array is never used, so n + 1 slots are allocated.
 * The same n + 1 is stored as the growth step that git_pqueue_insert adds
 * each time the array fills up.
 *
 * Returns GIT_SUCCESS, or GIT_ENOMEM when the array cannot be allocated
 * (including when n + 1 slots cannot be expressed as a byte count).
 * On failure q->d is left NULL.
 */
int git_pqueue_init(git_pqueue *q, size_t n, git_pqueue_cmp cmppri)
{
	size_t slots;

	assert(q);

	/*
	 * Refuse sizes for which (n + 1) * sizeof(void *) would wrap around:
	 * a wrapped product would allocate a tiny array while `avail` claims
	 * a huge one.
	 */
	if (n >= ((size_t)-1) / sizeof(void *)) {
		q->d = NULL;
		return GIT_ENOMEM;
	}

	/* Need to allocate n+1 elements since element 0 isn't used. */
	slots = n + 1;

	q->d = malloc(slots * sizeof(void *));
	if (q->d == NULL)
		return GIT_ENOMEM;

	q->size = 1; /* first usable slot; slot 0 stays empty */
	q->avail = slots;
	q->step = slots;
	q->cmppri = cmppri;

	return GIT_SUCCESS;
}
/*
 * Release the element array owned by `q`.
 *
 * Only the array is freed: the git_pqueue structure itself and the items
 * that were stored in it are left untouched. A NULL `q` is ignored, in line
 * with the NULL checks in insert/pop/peek.
 *
 * The bookkeeping is reset to the "empty" state so that a pop or peek on a
 * freed queue returns NULL instead of indexing the NULL array.
 */
void git_pqueue_free(git_pqueue *q)
{
	if (!q)
		return;

	free(q->d);
	q->d = NULL;
	q->size = 1;  /* empty: slot 0 is unused, nothing stored */
	q->avail = 0; /* no slots are allocated any more */
}
/*
 * Number of items currently stored in `q`.
 *
 * q->size is the index of the next free slot; because slot 0 is never
 * used, the item count is one less than that.
 */
size_t git_pqueue_size(git_pqueue *q)
{
	const size_t next_slot = q->size;

	return next_slot - 1;
}
/*
 * Move the item in slot `i` toward the head of the heap.
 *
 * While the item is not yet at the root and cmppri(parent, item) is
 * non-zero, the parent is shifted down one level; the item is written
 * once, into the slot where the climb stops.
 */
static void bubble_up(git_pqueue *q, size_t i)
{
	void *item = q->d[i];

	while (i > 1) {
		size_t up = parent(i);

		if (!q->cmppri(q->d[up], item))
			break;

		q->d[i] = q->d[up];
		i = up;
	}

	q->d[i] = item;
}
/*
 * Pick the child of slot `i` that should be compared against during a
 * downward pass.
 *
 * Returns 0 when slot `i` has no children. Otherwise returns the left
 * child, unless a right child exists and cmppri(left, right) is non-zero,
 * in which case the right child is returned.
 */
static size_t maxchild(git_pqueue *q, size_t i)
{
	const size_t lhs = left(i);
	const size_t rhs = lhs + 1;

	if (lhs >= q->size)
		return 0;

	if (rhs >= q->size)
		return lhs; /* only a left child exists */

	return q->cmppri(q->d[lhs], q->d[rhs]) ? rhs : lhs;
}
/*
 * Move the item in slot `i` away from the head of the heap.
 *
 * As long as maxchild() yields a child and cmppri(item, child) is non-zero,
 * that child is shifted up one level; the item is written once, into the
 * slot where the descent stops.
 */
static void percolate_down(git_pqueue *q, size_t i)
{
	void *item = q->d[i];

	for (;;) {
		size_t best = maxchild(q, i);

		if (best == 0 || !q->cmppri(item, q->d[best]))
			break;

		q->d[i] = q->d[best];
		i = best;
	}

	q->d[i] = item;
}
/*
 * Add item `d` to the queue.
 *
 * When the backing array is full it is enlarged by q->step slots first.
 * The item is appended in the next free slot and then bubbled up to its
 * place in the heap.
 *
 * Returns GIT_SUCCESS on success, GIT_ENOMEM if the array could not be
 * enlarged (the queue is left unchanged), or 1 if `q` is NULL.
 */
int git_pqueue_insert(git_pqueue *q, void *d)
{
	size_t slot;

	if (q == NULL)
		return 1;

	/* grow the array when no free slot is left */
	if (q->avail <= q->size) {
		size_t grown = q->size + q->step;
		void *resized = realloc(q->d, sizeof(void *) * grown);

		if (resized == NULL)
			return GIT_ENOMEM;

		q->d = resized;
		q->avail = grown;
	}

	/* append at the end, then restore heap order */
	slot = q->size;
	q->size = slot + 1;
	q->d[slot] = d;
	bubble_up(q, slot);

	return GIT_SUCCESS;
}
/*
 * Remove and return the item at the head of the queue (slot 1).
 *
 * The last stored item is moved into the head slot and percolated down to
 * restore heap order. Returns NULL if `q` is NULL or the queue is empty.
 */
void *git_pqueue_pop(git_pqueue *q)
{
	void *top;
	size_t last;

	if (q == NULL || q->size == 1)
		return NULL;

	top = q->d[1];

	/* shrink by one and refill the head from the former last slot */
	last = q->size - 1;
	q->size = last;
	q->d[1] = q->d[last];
	percolate_down(q, 1);

	return top;
}
/*
 * Return the item at the head of the queue (slot 1) without removing it.
 * Returns NULL if `q` is NULL or the queue is empty.
 */
void *git_pqueue_peek(git_pqueue *q)
{
	return (q != NULL && q->size != 1) ? q->d[1] : NULL;
}
/*
* BORING COPYRIGHT NOTICE:
*
* This file is a heavily modified version of the priority queue found
* in the Apache project and the libpqueue library.
*
* https://github.com/vy/libpqueue
*
* These are the original authors:
*
* Copyright 2010 Volkan Yazıcı <volkan.yazici@gmail.com>
* Copyright 2006-2010 The Apache Software Foundation
*
* This file is licensed under the Apache 2.0 license, which
* supposedly makes it compatible with the GPLv2 that libgit2 uses.
*
* Check the Apache license at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* So much licensing trouble for a binary heap. Oh well.
*/
#ifndef INCLUDE_pqueue_h__
#define INCLUDE_pqueue_h__
/**
 * callback function used to compare the priority of two elements.
 *
 * The heap code moves element `b` toward the head of the queue, ahead of
 * element `a`, whenever the callback returns non-zero for (a, b).
 */
typedef int (*git_pqueue_cmp)(void *a, void *b);
/** the priority queue handle */
typedef struct {
/*
 * size:  index of the next free slot in `d`; one more than the number of
 *        stored items, because slot 0 is never used
 * avail: number of slots currently allocated in `d`
 * step:  number of slots added each time `d` has to grow
 */
size_t size, avail, step;
/* comparison callback supplied to git_pqueue_init */
git_pqueue_cmp cmppri;
/* heap-ordered array of item pointers; the head of the queue is d[1] */
void **d;
} git_pqueue;
/**
 * initialize the queue
 *
 * @param q the queue structure to initialize; must not be NULL
 * @param n the initial estimate of the number of queue items for which memory
 * should be preallocated
 * @param cmppri the callback function to compare two nodes of the queue
 *
 * @return GIT_SUCCESS on success, GIT_ENOMEM for insufficient memory
 */
int git_pqueue_init(git_pqueue *q, size_t n, git_pqueue_cmp cmppri);
/**
 * free all memory used by the queue
 *
 * Only the queue's internal array is released; the git_pqueue structure
 * itself and the items stored in the queue are not freed.
 *
 * @param q the queue
 */
void git_pqueue_free(git_pqueue *q);
/**
 * return the size of the queue.
 * @param q the queue
 * @return the number of items currently stored in the queue
 */
size_t git_pqueue_size(git_pqueue *q);
/**
 * insert an item into the queue.
 * @param q the queue
 * @param d the item
 * @return GIT_SUCCESS on success, GIT_ENOMEM if the queue could not be
 * grown, 1 if q is NULL
 */
int git_pqueue_insert(git_pqueue *q, void *d);
/**
 * pop the highest-ranking item from the queue.
 * @param q the queue
 * @return NULL if q is NULL or the queue is empty, otherwise the entry
 */
void *git_pqueue_pop(git_pqueue *q);
/**
 * access highest-ranking item without removing it.
 * @param q the queue
 * @return NULL if q is NULL or the queue is empty, otherwise the entry
 */
void *git_pqueue_peek(git_pqueue *q);
#endif /* INCLUDE_pqueue_h__ */
/** @} */
......@@ -53,7 +53,7 @@ typedef struct {
* Callbacks for the ODB cache, implemented
* as a hash table
*/
uint32_t object_table_hash(const void *key, int hash_id)
static uint32_t object_table_hash(const void *key, int hash_id)
{
uint32_t r;
git_oid *id;
......
......@@ -8,60 +8,4 @@
#include "repository.h"
#include "hashtable.h"
/* Forward declaration so list nodes can point at walker commits. */
struct git_revwalk_commit;
/* Node of a doubly linked list (next/prev) whose payload is a walker commit. */
typedef struct git_revwalk_listnode {
struct git_revwalk_commit *walk_commit;
struct git_revwalk_listnode *next;
struct git_revwalk_listnode *prev;
} git_revwalk_listnode;
/* List header: pointers to the first and last node plus an element count. */
typedef struct git_revwalk_list {
struct git_revwalk_listnode *head;
struct git_revwalk_listnode *tail;
size_t size;
} git_revwalk_list;
/*
 * Per-commit walker record: the wrapped commit object, a list of its
 * parents, and bookkeeping packed into bitfields.
 */
struct git_revwalk_commit {
git_commit *commit_object;
git_revwalk_list parents;
/* NOTE(review): presumably the incoming-edge count used by the topological sort — confirm */
unsigned short in_degree;
/* three 1-bit state flags followed by a 25-bit `flags` field */
unsigned seen:1,
uninteresting:1,
topo_delay:1,
flags:25;
};
typedef struct git_revwalk_commit git_revwalk_commit;
/*
 * Walker state: the repository being walked, a hashtable of commits, the
 * list being iterated, and the sorting mode.
 */
struct git_revwalk {
git_repository *repo;
git_hashtable *commits;
git_revwalk_list iterator;
/* function pointer that takes a commit list and returns a walker commit */
git_revwalk_commit *(*next)(git_revwalk_list *);
/* NOTE(review): presumably set once iteration has started — confirm against revwalk.c */
unsigned walking:1;
unsigned int sorting;
};
/*
 * Internal walker and commit-list helpers. Only the declarations are
 * visible here; see the definitions for their exact behavior.
 */
void git_revwalk__prepare_walk(git_revwalk *walk);
int git_revwalk__enroot(git_revwalk *walk, git_commit *commit);
int git_revwalk_list_push_back(git_revwalk_list *list, git_revwalk_commit *commit);
int git_revwalk_list_push_front(git_revwalk_list *list, git_revwalk_commit *obj);
git_revwalk_commit *git_revwalk_list_pop_back(git_revwalk_list *list);
git_revwalk_commit *git_revwalk_list_pop_front(git_revwalk_list *list);
void git_revwalk_list_clear(git_revwalk_list *list);
void git_revwalk_list_timesort(git_revwalk_list *list);
void git_revwalk_list_toposort(git_revwalk_list *list);
#endif /* INCLUDE_revwalk_h__ */
......@@ -93,6 +93,8 @@ GIT_INLINE(int) git__is_sizet(git_off_t p)
extern char *git__strtok(char *output, char *src, char *delimit);
extern char *git__strtok_keep(char *output, char *src, char *delimit);
#define STRLEN(str) (sizeof(str) - 1)
/*
* Realloc the buffer pointed at by variable 'x' so that it can hold
* at least 'nr' entries; the number of entries currently allocated
......
......@@ -70,12 +70,12 @@ static const int commit_sorting_time_reverse[][6] = {
static const int result_bytes = 24;
static int get_commit_index(git_commit *commit)
static int get_commit_index(git_oid *raw_oid)
{
int i;
char oid[40];
git_oid_fmt(oid, &commit->object.id);
git_oid_fmt(oid, raw_oid);
for (i = 0; i < commit_count; ++i)
if (memcmp(oid, commit_ids[i], 40) == 0)
......@@ -84,23 +84,31 @@ static int get_commit_index(git_commit *commit)
return -1;
}
static int test_walk(git_revwalk *walk, git_commit *start_from,
static int test_walk(git_revwalk *walk,
int flags, const int possible_results[][6], int results_count)
{
git_commit *commit = NULL;
git_oid oid;
int i;
int result_array[commit_count];
git_revwalk_reset(walk);
git_revwalk_sorting(walk, flags);
git_revwalk_push(walk, start_from);
for (i = 0; i < commit_count; ++i)
result_array[i] = -1;
i = 0;
while (git_revwalk_next(&commit, walk) == GIT_SUCCESS)
result_array[i++] = get_commit_index(commit);
while (git_revwalk_next(&oid, walk) == GIT_SUCCESS) {
result_array[i++] = get_commit_index(&oid);
/*{
char str[41];
git_oid_fmt(str, &oid);
str[40] = 0;
printf(" %d) %s\n", i, str);
}*/
}
for (i = 0; i < results_count; ++i)
if (memcmp(possible_results[i],
......@@ -114,103 +122,26 @@ BEGIN_TEST(walk0, "do a simple walk on a repo with different sorting modes")
git_oid id;
git_repository *repo;
git_revwalk *walk;
git_commit *head = NULL;
must_pass(git_repository_open(&repo, REPOSITORY_FOLDER));
must_pass(git_revwalk_new(&walk, repo));
git_oid_mkstr(&id, commit_head);
git_revwalk_push(walk, &id);
must_pass(git_commit_lookup(&head, repo, &id));
must_pass(test_walk(walk, head,
GIT_SORT_TIME,
commit_sorting_time, 1));
must_pass(test_walk(walk, head,
GIT_SORT_TOPOLOGICAL,
commit_sorting_topo, 2));
must_pass(test_walk(walk, GIT_SORT_TIME, commit_sorting_time, 1));
must_pass(test_walk(walk, head,
GIT_SORT_TIME | GIT_SORT_REVERSE,
commit_sorting_time_reverse, 1));
must_pass(test_walk(walk, GIT_SORT_TOPOLOGICAL, commit_sorting_topo, 2));
must_pass(test_walk(walk, head,
GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE,
commit_sorting_topo_reverse, 2));
must_pass(test_walk(walk, GIT_SORT_TIME | GIT_SORT_REVERSE, commit_sorting_time_reverse, 1));
must_pass(test_walk(walk, GIT_SORT_TOPOLOGICAL | GIT_SORT_REVERSE, commit_sorting_topo_reverse, 2));
git_revwalk_free(walk);
git_repository_free(repo);
END_TEST
/*
 * Builds commit lists from synthetic commits that carry only a committer
 * signature, sorts them with git_revwalk_list_timesort and checks the
 * resulting order. Three scenarios are covered: 20 rounds of 500-999
 * commits with random times, 200 commits with identical times, and an
 * empty list.
 */
BEGIN_TEST(list0, "check that a commit list is properly sorted by time")
git_revwalk_list list;
git_revwalk_listnode *n;
int i, t;
time_t previous_time;
/*
 * Walks the list from the head and checks that committer times never
 * increase: each one must be <= the previous one, starting from INT_MAX.
 */
#define TEST_SORTED() \
previous_time = INT_MAX;\
for (n = list.head; n != NULL; n = n->next) {\
must_be_true(n->walk_commit->commit_object->committer->when.time <= previous_time);\
previous_time = n->walk_commit->commit_object->committer->when.time;\
}
/*
 * Frees each node's committer signature, commit object and walker commit,
 * then clears the list itself.
 */
#define CLEAR_LIST() \
for (n = list.head; n != NULL; n = n->next) {\
git_signature_free(n->walk_commit->commit_object->committer);\
free(n->walk_commit->commit_object);\
free(n->walk_commit);\
}\
git_revwalk_list_clear(&list);
/* start from an empty (all-zero) list */
memset(&list, 0x0, sizeof(git_revwalk_list));
/* seeded from the clock, so the random cases differ between runs */
srand((unsigned int)time(NULL));
for (t = 0; t < 20; ++t) {
const int test_size = rand() % 500 + 500;
/* Purely random sorting test */
for (i = 0; i < test_size; ++i) {
git_commit *c = git__malloc(sizeof(git_commit));
git_revwalk_commit *rc = git__malloc(sizeof(git_revwalk_commit));
c->committer = git_signature_new("", "", (time_t)rand(), 0);
rc->commit_object = c;
git_revwalk_list_push_back(&list, rc);
}
git_revwalk_list_timesort(&list);
TEST_SORTED();
CLEAR_LIST();
}
/* Try to sort list with all dates equal. */
for (i = 0; i < 200; ++i) {
git_commit *c = git__malloc(sizeof(git_commit));
git_revwalk_commit *rc = git__malloc(sizeof(git_revwalk_commit));
c->committer = git_signature_new("", "", 0, 0);
rc->commit_object = c;
git_revwalk_list_push_back(&list, rc);
}
git_revwalk_list_timesort(&list);
TEST_SORTED();
CLEAR_LIST();
/* Try to sort empty list */
git_revwalk_list_timesort(&list);
TEST_SORTED();
END_TEST
BEGIN_SUITE(revwalk)
ADD_TEST(walk0);
ADD_TEST(list0);
END_SUITE
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment