Commit 9d83a2b0 by Stan Hu

Sanitize the hunk header to ensure it contains UTF-8 valid data

The diff driver truncates the hunk header text to 80 bytes, which can truncate
4-byte Unicode characters and introduce garbage characters in the diff
output. This change sanitizes the hunk header before it is displayed.

This mirrors the test in git: https://github.com/git/git/blob/master/t/t4025-hunk-header.sh

Closes https://github.com/libgit2/rugged/issues/716
parent 0ad2372b
......@@ -6,6 +6,7 @@
*/
#include "diff_xdiff.h"
#include "util.h"
#include "git2/errors.h"
#include "diff.h"
......@@ -115,6 +116,7 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
const git_diff_delta *delta = patch->base.delta;
git_patch_generated_output *output = &info->xo->output;
git_diff_line line;
size_t buffer_len;
if (len == 1) {
output->error = git_xdiff_parse_hunk(&info->hunk, bufs[0].ptr);
......@@ -124,6 +126,16 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
info->hunk.header_len = bufs[0].size;
if (info->hunk.header_len >= sizeof(info->hunk.header))
info->hunk.header_len = sizeof(info->hunk.header) - 1;
/* Sanitize the hunk header in case there is invalid Unicode */
buffer_len = git__utf8_valid_buf_length((const uint8_t *) bufs[0].ptr, info->hunk.header_len);
/* Sanitizing the hunk header may delete the newline, so add it back again if there is room */
if (buffer_len < info->hunk.header_len) {
bufs[0].ptr[buffer_len] = '\n';
buffer_len += 1;
info->hunk.header_len = buffer_len;
}
memcpy(info->hunk.header, bufs[0].ptr, info->hunk.header_len);
info->hunk.header[info->hunk.header_len] = '\0';
......
......@@ -806,6 +806,22 @@ double git_time_monotonic(void)
return git__timer();
}
size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len)
{
size_t offset = 0;
while (offset < str_len) {
int length = git__utf8_charlen(str + offset, str_len - offset);
if (length < 0)
break;
offset += length;
}
return offset;
}
#ifdef GIT_WIN32
int git__getenv(git_buf *out, const char *name)
{
......
......@@ -454,6 +454,16 @@ extern size_t git__unescape(char *str);
extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
/*
* Iterate through an UTF-8 string and stops after finding any invalid UTF-8
* codepoints.
*
* @param str string to scan
* @param str_len size of the string
* @return length in bytes of the string that contains valid data
*/
extern size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len);
/*
* Safely zero-out memory, making sure that the compiler
* doesn't optimize away the operation.
*/
......
......@@ -25,6 +25,12 @@ void test_diff_patch__cleanup(void)
#define EXPECTED_HUNK "@@ -1,2 +0,0 @@\n"
#define UTF8_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\n"
#define UTF8_TRUNCATED_A_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\n"
#define UTF8_TRUNCATED_L_HUNK_HEADER "\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E\xE6\x97\xA5\n"
static int check_removal_cb(
const git_diff_delta *delta,
const git_diff_hunk *hunk,
......@@ -610,3 +616,89 @@ void test_diff_patch__line_counts_with_eofnl(void)
git_buf_free(&content);
}
void test_diff_patch__can_strip_bad_utf8(void)
{
const char *a = "A " UTF8_HUNK_HEADER
" B\n"
" C\n"
" D\n"
" E\n"
" F\n"
" G\n"
" H\n"
" I\n"
" J\n"
" K\n"
"L " UTF8_HUNK_HEADER
" M\n"
" N\n"
" O\n"
" P\n"
" Q\n"
" R\n"
" S\n"
" T\n"
" U\n"
" V\n";
const char *b = "A " UTF8_HUNK_HEADER
" B\n"
" C\n"
" D\n"
" E modified\n"
" F\n"
" G\n"
" H\n"
" I\n"
" J\n"
" K\n"
"L " UTF8_HUNK_HEADER
" M\n"
" N\n"
" O\n"
" P modified\n"
" Q\n"
" R\n"
" S\n"
" T\n"
" U\n"
" V\n";
const char *expected = "diff --git a/file b/file\n"
"index d0647c4..7827ce5 100644\n"
"--- a/file\n"
"+++ b/file\n"
"@@ -2,7 +2,7 @@ A " UTF8_TRUNCATED_A_HUNK_HEADER
" B\n"
" C\n"
" D\n"
"- E\n"
"+ E modified\n"
" F\n"
" G\n"
" H\n"
"@@ -13,7 +13,7 @@ L " UTF8_TRUNCATED_L_HUNK_HEADER
" M\n"
" N\n"
" O\n"
"- P\n"
"+ P modified\n"
" Q\n"
" R\n"
" S\n";
git_diff_options opts;
git_patch *patch;
git_buf buf = GIT_BUF_INIT;
cl_git_pass(git_diff_init_options(&opts, GIT_DIFF_OPTIONS_VERSION));
cl_git_pass(git_patch_from_buffers(&patch, a, strlen(a), NULL, b, strlen(b), NULL, &opts));
cl_git_pass(git_patch_to_buf(&buf, patch));
cl_assert_equal_s(expected, buf.ptr);
git_patch_free(patch);
git_buf_free(&buf);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment