Commit d8896bda by Patrick Steinhardt

diff_generate: avoid excessive stats of .gitattribute files

When generating a diff between two trees, for each file that is to be
diffed we have to determine whether it shall be treated as text or as
binary files. While git has heuristics to determine which kind of diff
to generate, users can also that default behaviour by setting or
unsetting the 'diff' attribute for specific files.

Because of that, we have to query gitattributes in order to determine
how to diff the current files. Instead of hitting the '.gitattributes'
file every time we need to query an attribute, which can get expensive
especially on networked file systems, we try to cache them instead. This
works perfectly fine for every '.gitattributes' file that is found, but
we hit cache invalidation problems when we determine that an attribuse
file is _not_ existing. We do create an entry in the cache for missing
'.gitattributes' files, but as soon as we hit that file again we
invalidate it and stat it again to see if it has now appeared.

In the case of diffing large trees with each other, this behaviour is
very suboptimal. For each pair of files that is to be diffed, we will
repeatedly query every directory component leading towards their
respective location for an attributes file. This leads to thousands or
even hundreds of thousands of wasted syscalls.

The attributes cache already has a mechanism to help in that scenario in
form of the `git_attr_session`. As long as the same attributes session
is still active, we will not try to re-query the gitmodules files at all
but simply retain our currently cached results. To fix our problem, we
can create a session at the top-most level, which is the initialization
of the `git_diff` structure, and use it in order to look up the correct
diff driver. As the `git_diff` structure is used to generate patches for
multiple files at once, this neatly solves our problem by retaining the
session until patches for all files have been generated.

The fix has been tested with linux.git by calling
`git_diff_tree_to_tree` and `git_diff_to_buf` with v4.10^{tree} and
v4.14^{tree}.

                | time    | .gitattributes stats
    without fix | 33.201s | 844614
    with fix    | 30.327s | 4441

While execution only improved by roughly 10%, the stat(3) syscalls for
.gitattributes files decreased by 99.5%. The benchmarks were quite
simple with best-of-three timings on Linux ext4 systems. One can assume
that for network based file systems the performance gain will be a lot
larger due to a much higher latency.
parent 7610638e
......@@ -34,6 +34,7 @@ typedef enum {
struct git_diff {
git_refcount rc;
git_repository *repo;
git_attr_session attrsession;
git_diff_origin_t type;
git_diff_options opts;
git_vector deltas; /* vector of git_diff_delta */
......
......@@ -354,27 +354,30 @@ done:
}
int git_diff_driver_lookup(
git_diff_driver **out, git_repository *repo, const char *path)
git_diff_driver **out, git_repository *repo,
git_attr_session *attrsession, const char *path)
{
int error = 0;
const char *value;
const char *values[1], *attrs[] = { "diff" };
assert(out);
*out = NULL;
if (!repo || !path || !strlen(path))
/* just use the auto value */;
else if ((error = git_attr_get(&value, repo, 0, path, "diff")) < 0)
else if ((error = git_attr_get_many_with_session(values, repo,
attrsession, 0, path, 1, attrs)) < 0)
/* return error below */;
else if (GIT_ATTR_UNSPECIFIED(value))
else if (GIT_ATTR_UNSPECIFIED(values[0]))
/* just use the auto value */;
else if (GIT_ATTR_FALSE(value))
else if (GIT_ATTR_FALSE(values[0]))
*out = &global_drivers[DIFF_DRIVER_BINARY];
else if (GIT_ATTR_TRUE(value))
else if (GIT_ATTR_TRUE(values[0]))
*out = &global_drivers[DIFF_DRIVER_TEXT];
/* otherwise look for driver information in config and build driver */
else if ((error = git_diff_driver_load(out, repo, value)) < 0) {
else if ((error = git_diff_driver_load(out, repo, values[0])) < 0) {
if (error == GIT_ENOTFOUND) {
error = 0;
giterr_clear();
......
......@@ -9,6 +9,7 @@
#include "common.h"
#include "attr_file.h"
#include "buffer.h"
typedef struct git_diff_driver_registry git_diff_driver_registry;
......@@ -18,7 +19,8 @@ void git_diff_driver_registry_free(git_diff_driver_registry *);
typedef struct git_diff_driver git_diff_driver;
int git_diff_driver_lookup(git_diff_driver **, git_repository *, const char *);
int git_diff_driver_lookup(git_diff_driver **, git_repository *,
git_attr_session *attrsession, const char *);
void git_diff_driver_free(git_diff_driver *);
/* diff option flags to force off and on for this driver */
......
......@@ -54,7 +54,8 @@ static int diff_file_content_init_common(
fc->src = GIT_ITERATOR_TYPE_TREE;
if (!fc->driver &&
git_diff_driver_lookup(&fc->driver, fc->repo, fc->file->path) < 0)
git_diff_driver_lookup(&fc->driver, fc->repo,
NULL, fc->file->path) < 0)
return -1;
/* give driver a chance to modify options */
......@@ -101,7 +102,8 @@ int git_diff_file_content__init_from_diff(
fc->file = use_old ? &delta->old_file : &delta->new_file;
fc->src = use_old ? diff->old_src : diff->new_src;
if (git_diff_driver_lookup(&fc->driver, fc->repo, fc->file->path) < 0)
if (git_diff_driver_lookup(&fc->driver, fc->repo,
&diff->attrsession, fc->file->path) < 0)
return -1;
switch (delta->status) {
......
......@@ -389,6 +389,7 @@ static void diff_generated_free(git_diff *d)
{
git_diff_generated *diff = (git_diff_generated *)d;
git_attr_session__free(&diff->base.attrsession);
git_vector_free_deep(&diff->base.deltas);
git_pathspec__vfree(&diff->pathspec);
......@@ -418,6 +419,7 @@ static git_diff_generated *diff_generated_alloc(
diff->base.new_src = new_iter->type;
diff->base.patch_fn = git_patch_generated_from_diff;
diff->base.free_fn = diff_generated_free;
git_attr_session__init(&diff->base.attrsession, repo);
memcpy(&diff->base.opts, &dflt, sizeof(git_diff_options));
git_pool_init(&diff->base.pool, 1);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment