Commit 1d95b59b by Edward Thomson

utf8: refactor utf8 functions

Move the utf8 functions into a proper namespace `git_utf8` instead of
being in the namespaceless `git__` function group.  Update them to
have out-params first and use `char *` instead of `uint8_t *` to match
our API treating strings as `char *` (even if they truly contain `uchar`s
inside).
parent 4f4b1139
......@@ -83,6 +83,7 @@
#include "thread.h"
#include "integer.h"
#include "assert_safe.h"
#include "utf8.h"
/*
* Include the declarations for deprecated functions; this ensures
......
......@@ -6,7 +6,6 @@
*/
#include "diff_xdiff.h"
#include "util.h"
#include "git2/errors.h"
#include "diff.h"
......@@ -128,7 +127,7 @@ static int git_xdiff_cb(void *priv, mmbuffer_t *bufs, int len)
info->hunk.header_len = sizeof(info->hunk.header) - 1;
/* Sanitize the hunk header in case there is invalid Unicode */
buffer_len = git__utf8_valid_buf_length((const uint8_t *) bufs[0].ptr, info->hunk.header_len);
buffer_len = git_utf8_valid_buf_length(bufs[0].ptr, info->hunk.header_len);
/* Sanitizing the hunk header may delete the newline, so add it back again if there is room */
if (buffer_len < info->hunk.header_len) {
bufs[0].ptr[buffer_len] = '\n';
......
......@@ -1562,8 +1562,8 @@ GIT_INLINE(bool) verify_dospath(
static int32_t next_hfs_char(const char **in, size_t *len)
{
while (*len) {
int32_t codepoint;
int cp_len = git__utf8_iterate((const uint8_t *)(*in), (int)(*len), &codepoint);
uint32_t codepoint;
int cp_len = git_utf8_iterate(&codepoint, *in, *len);
if (cp_len < 0)
return -1;
......@@ -1595,7 +1595,7 @@ static int32_t next_hfs_char(const char **in, size_t *len)
* the ASCII range, which is perfectly fine, because the
* git folder name can only be composed of ascii characters
*/
return git__tolower(codepoint);
return git__tolower((int)codepoint);
}
return 0; /* NULL byte -- end of string */
}
......
/*
* Copyright (C) the libgit2 contributors. All rights reserved.
*
* This file is part of libgit2, distributed under the GNU GPL v2 with
* a Linking Exception. For full terms see the included COPYING file.
*/
#include "utf8.h"
#include "common.h"
/*
* git_utf8_iterate is taken from the utf8proc project,
* http://www.public-software-group.org/utf8proc
*
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the value is
 * the total number of bytes in that sequence:
 *
 *   0x00-0x7F -> 1
 *   0x80-0xBF -> 0 (cannot start a sequence)
 *   0xC0-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFF -> 0 (cannot start a sequence)
 *
 * Each row below covers 16 consecutive byte values.
 */
static const uint8_t utf8proc_utf8class[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Compute the byte length of the UTF-8 sequence that begins at `str`.
 *
 * The length is taken from the lead byte via `utf8proc_utf8class`, then each
 * following byte is checked for the `10xxxxxx` continuation pattern.
 *
 * NOTE(review): a `str_len` of 0 skips the bounds check entirely, so `str[0]`
 * and any continuation bytes are read regardless -- confirm that callers
 * never pass an empty buffer.
 *
 * @param str first byte of the sequence
 * @param str_len number of bytes available at `str`
 * @return the sequence length (1 to 4), or -1 if the lead byte cannot start
 *         a sequence, the sequence would not fit in `str_len` bytes, or a
 *         continuation byte is malformed
 */
static int utf8_charlen(const uint8_t *str, size_t str_len)
{
	const uint8_t expected = utf8proc_utf8class[str[0]];
	size_t pos = 1;

	/* Class 0: this byte cannot begin a sequence. */
	if (expected == 0)
		return -1;

	/* The sequence would extend past the end of the buffer. */
	if (str_len != 0 && str_len < expected)
		return -1;

	while (pos < expected) {
		if ((str[pos] & 0xC0) != 0x80)
			return -1;
		pos++;
	}

	return (int)expected;
}
/*
 * Decode the single UTF-8 sequence at the start of `_str`.
 *
 * `*out` is set to 0 up front and only overwritten with the decoded value
 * on success.  Besides malformed sequences (as reported by utf8_charlen),
 * the following decoded values are rejected:
 *   - values that fit in a shorter encoding (< 0x80 in two bytes,
 *     < 0x800 in three bytes, < 0x10000 in four bytes)
 *   - 0xD800-0xDFFF and 0xFDD0-0xFDEF
 *   - values >= 0x110000
 *   - any value whose low 16 bits are 0xFFFE or 0xFFFF
 *
 * @param out receives the decoded codepoint
 * @param _str current position in the string
 * @param str_len size left in the string
 * @return number of bytes consumed, or -1 if the sequence is invalid
 */
int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	uint32_t cp;
	int nbytes;

	*out = 0;

	nbytes = utf8_charlen(bytes, str_len);
	if (nbytes < 0)
		return -1;

	if (nbytes == 1) {
		cp = bytes[0];
	} else if (nbytes == 2) {
		cp = ((bytes[0] & 0x1F) << 6) + (bytes[1] & 0x3F);

		if (cp < 0x80)
			return -1;
	} else if (nbytes == 3) {
		cp = ((bytes[0] & 0x0F) << 12) + ((bytes[1] & 0x3F) << 6)
			+ (bytes[2] & 0x3F);

		if (cp < 0x800)
			return -1;
		if (cp >= 0xD800 && cp < 0xE000)
			return -1;
		if (cp >= 0xFDD0 && cp < 0xFDF0)
			return -1;
	} else if (nbytes == 4) {
		cp = ((bytes[0] & 0x07) << 18) + ((bytes[1] & 0x3F) << 12)
			+ ((bytes[2] & 0x3F) << 6) + (bytes[3] & 0x3F);

		if (cp < 0x10000 || cp >= 0x110000)
			return -1;
	} else {
		return -1;
	}

	/* Reject xxFFFE and xxFFFF in every plane. */
	if ((cp & 0xFFFF) >= 0xFFFE)
		return -1;

	*out = cp;
	return nbytes;
}
/*
 * Measure the leading portion of a buffer that is made up of sequences
 * accepted by utf8_charlen.
 *
 * Scanning stops at the first sequence utf8_charlen rejects, or once
 * `str_len` bytes have been consumed.
 *
 * @param _str buffer to scan
 * @param str_len size of the buffer in bytes
 * @return number of bytes, counted from the start, that were accepted
 */
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t valid = 0;
	int step;

	for (; valid < str_len; valid += (size_t)step) {
		step = utf8_charlen(bytes + valid, str_len - valid);

		if (step < 0)
			break;
	}

	return valid;
}
/*
* Copyright (C) the libgit2 contributors. All rights reserved.
*
* This file is part of libgit2, distributed under the GNU GPL v2 with
* a Linking Exception. For full terms see the included COPYING file.
*/
#ifndef INCLUDE_utf8_h__
#define INCLUDE_utf8_h__
#include "common.h"
/**
* Iterate through a UTF-8 string, yielding one codepoint at a time.
*
* @param out pointer where to store the current codepoint; it is set to 0
*            when the codepoint is invalid
* @param str current position in the string
* @param str_len size left in the string, in bytes
* @return length in bytes of the read codepoint; -1 if the codepoint was invalid
*/
extern int git_utf8_iterate(uint32_t *out, const char *str, size_t str_len);
/**
* Iterate through a UTF-8 string and stop at the first invalid UTF-8
* sequence.
*
* @param str string to scan
* @param str_len size of the string, in bytes
* @return length in bytes of the leading part of the string that contains
*         valid data
*/
extern size_t git_utf8_valid_buf_length(const char *str, size_t str_len);
#endif
......@@ -734,123 +734,6 @@ void git__qsort_r(
#endif
}
/*
* git__utf8_iterate is taken from the utf8proc project,
* http://www.public-software-group.org/utf8proc
*
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence; the value is
 * the total number of bytes in that sequence:
 *
 *   0x00-0x7F -> 1
 *   0x80-0xBF -> 0 (cannot start a sequence)
 *   0xC0-0xDF -> 2
 *   0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4
 *   0xF8-0xFF -> 0 (cannot start a sequence)
 *
 * Each row below covers 16 consecutive byte values.
 */
static const int8_t utf8proc_utf8class[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
/*
 * Compute the byte length of the UTF-8 sequence that begins at `str`.
 *
 * The length is taken from the lead byte via `utf8proc_utf8class`, then each
 * following byte is checked for the `10xxxxxx` continuation pattern.
 *
 * NOTE(review): a `str_len` of 0 skips the bounds check entirely -- confirm
 * that callers never pass an empty buffer.
 *
 * @param str first byte of the sequence
 * @param str_len number of bytes available at `str`
 * @return the sequence length (1 to 4), or -1 if the lead byte cannot start
 *         a sequence, the sequence would not fit in `str_len` bytes, or a
 *         continuation byte is malformed
 */
static int util_utf8_charlen(const uint8_t *str, size_t str_len)
{
	const size_t expected = utf8proc_utf8class[str[0]];
	size_t pos = 1;

	/* Class 0: this byte cannot begin a sequence. */
	if (expected == 0)
		return -1;

	/* The sequence would extend past the end of the buffer. */
	if (str_len != 0 && str_len < expected)
		return -1;

	while (pos < expected) {
		if ((str[pos] & 0xC0) != 0x80)
			return -1;
		pos++;
	}

	return (int)expected;
}
/*
 * Decode the single UTF-8 sequence at the start of `str`.
 *
 * `*dst` is set to -1 up front and only overwritten with the decoded value
 * on success.  Besides malformed sequences (as reported by
 * util_utf8_charlen), the following decoded values are rejected:
 *   - values that fit in a shorter encoding (< 0x80 in two bytes,
 *     < 0x800 in three bytes, < 0x10000 in four bytes)
 *   - 0xD800-0xDFFF and 0xFDD0-0xFDEF
 *   - values >= 0x110000
 *   - any value whose low 16 bits are 0xFFFE or 0xFFFF
 *
 * @param str current position in the string
 * @param str_len size left in the string
 * @param dst receives the decoded codepoint
 * @return number of bytes consumed, or -1 if the sequence is invalid
 */
int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst)
{
	int32_t cp;
	int nbytes;

	*dst = -1;

	nbytes = util_utf8_charlen(str, str_len);
	if (nbytes < 0)
		return -1;

	if (nbytes == 1) {
		cp = str[0];
	} else if (nbytes == 2) {
		cp = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);

		if (cp < 0x80)
			return -1;
	} else if (nbytes == 3) {
		cp = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
			+ (str[2] & 0x3F);

		if (cp < 0x800)
			return -1;
		if (cp >= 0xD800 && cp < 0xE000)
			return -1;
		if (cp >= 0xFDD0 && cp < 0xFDF0)
			return -1;
	} else if (nbytes == 4) {
		cp = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
			+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);

		if (cp < 0x10000 || cp >= 0x110000)
			return -1;
	} else {
		return -1;
	}

	/* Reject xxFFFE and xxFFFF in every plane. */
	if ((cp & 0xFFFF) >= 0xFFFE)
		return -1;

	*dst = cp;
	return nbytes;
}
/*
 * Measure the leading portion of a buffer that is made up of sequences
 * accepted by util_utf8_charlen.
 *
 * Scanning stops at the first sequence util_utf8_charlen rejects, or once
 * `str_len` bytes have been consumed.
 *
 * @param str buffer to scan
 * @param str_len size of the buffer in bytes
 * @return number of bytes, counted from the start, that were accepted
 */
size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len)
{
	size_t valid = 0;
	int step;

	for (; valid < str_len; valid += (size_t)step) {
		step = util_utf8_charlen(str + valid, str_len - valid);

		if (step < 0)
			break;
	}

	return valid;
}
#ifdef GIT_WIN32
int git__getenv(git_buf *out, const char *name)
{
......
......@@ -317,27 +317,6 @@ extern int git__date_rfc2822_fmt(char *out, size_t len, const git_time *date);
extern size_t git__unescape(char *str);
/*
* Iterate through a UTF-8 string, yielding one
* codepoint at a time.
*
* @param str current position in the string
* @param str_len size left in the string; -1 if the string is NUL-terminated
* @param dst pointer where to store the current codepoint
* @return length in bytes of the read codepoint; -1 if the codepoint was invalid
*/
extern int git__utf8_iterate(const uint8_t *str, int str_len, int32_t *dst);
/*
* Iterate through a UTF-8 string and stop at the first invalid UTF-8
* sequence.
*
* @param str string to scan
* @param str_len size of the string, in bytes
* @return length in bytes of the leading part of the string that contains
*         valid data
*/
extern size_t git__utf8_valid_buf_length(const uint8_t *str, size_t str_len);
/*
* Safely zero-out memory, making sure that the compiler
* doesn't optimize away the operation.
*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment