Commit 8c2c0fa8 by Edward Thomson

net: refactor url parsing

Refactor url parsing to simplify the state-passing (introducing a
struct) and add a path parser for future reusability.
parent a79aa142
...@@ -19,6 +19,30 @@ ...@@ -19,6 +19,30 @@
#define DEFAULT_PORT_GIT "9418" #define DEFAULT_PORT_GIT "9418"
#define DEFAULT_PORT_SSH "22" #define DEFAULT_PORT_SSH "22"
/* Initializer that zeroes every member of a git_net_url_parser. */
#define GIT_NET_URL_PARSER_INIT { 0 }

/*
 * Intermediate state shared by the URL parsing helpers.
 *
 * Each component is recorded as a pointer plus a length; a length of
 * zero means the component was not seen.
 * NOTE(review): the pointers appear to reference the caller's input
 * string rather than owned copies -- confirm against the helpers.
 */
typedef struct {
	/*
	 * Set to 1 when the scheme was followed by "//".  Declared
	 * unsigned: whether a plain `int` bit-field is signed is
	 * implementation-defined, and a signed one-bit field can only
	 * represent 0 and -1, so storing 1 in it would not read back as 1.
	 */
	unsigned int hierarchical : 1;

	const char *scheme;
	const char *user;
	const char *password;
	const char *host;
	const char *port;
	const char *path;
	const char *query;
	const char *fragment;

	size_t scheme_len;
	size_t user_len;
	size_t password_len;
	size_t host_len;
	size_t port_len;
	size_t path_len;
	size_t query_len;
	size_t fragment_len;
} git_net_url_parser;
bool git_net_hostname_matches_cert( bool git_net_hostname_matches_cert(
const char *hostname, const char *hostname,
const char *pattern) const char *pattern)
...@@ -63,6 +87,12 @@ bool git_net_hostname_matches_cert( ...@@ -63,6 +87,12 @@ bool git_net_hostname_matches_cert(
return false; return false;
} }
/*
 * Evaluates to 1 when `c` is a character accepted in a URL scheme
 * (an ASCII letter, an ASCII digit, '+', '-' or '.') and to 0
 * otherwise.  The argument is expanded more than once, so it must
 * not have side effects.
 */
#define is_valid_scheme_char(c) \
	((c) == '+' || (c) == '-' || (c) == '.' || \
	 ('0' <= (c) && (c) <= '9') || \
	 ('A' <= (c) && (c) <= 'Z') || \
	 ('a' <= (c) && (c) <= 'z'))
bool git_net_str_is_url(const char *str) bool git_net_str_is_url(const char *str)
{ {
const char *c; const char *c;
...@@ -71,10 +101,7 @@ bool git_net_str_is_url(const char *str) ...@@ -71,10 +101,7 @@ bool git_net_str_is_url(const char *str)
if (*c == ':' && *(c+1) == '/' && *(c+2) == '/') if (*c == ':' && *(c+1) == '/' && *(c+2) == '/')
return true; return true;
if ((*c < 'a' || *c > 'z') && if (!is_valid_scheme_char(*c))
(*c < 'A' || *c > 'Z') &&
(*c < '0' || *c > '9') &&
(*c != '+' && *c != '-' && *c != '.'))
break; break;
} }
...@@ -97,6 +124,16 @@ static const char *default_port_for_scheme(const char *scheme) ...@@ -97,6 +124,16 @@ static const char *default_port_for_scheme(const char *scheme)
return NULL; return NULL;
} }
/*
 * Report whether the scheme of length `scheme_len` starting at `scheme`
 * is one of the ssh schemes: "ssh", "ssh+git" or "git+ssh", compared
 * case-insensitively.  `scheme` need not be NUL-terminated at
 * `scheme_len`.  An empty scheme is never an ssh scheme.
 *
 * The length must match exactly: comparing only `scheme_len` bytes
 * would otherwise accept any prefix of a known name (for example "s"
 * or "ssh+") as an ssh scheme.
 */
static bool is_ssh_scheme(const char *scheme, size_t scheme_len)
{
	static const char *const ssh_schemes[] = {
		"ssh", "ssh+git", "git+ssh"
	};
	size_t i;

	if (!scheme_len)
		return false;

	for (i = 0; i < sizeof(ssh_schemes) / sizeof(ssh_schemes[0]); i++) {
		if (strlen(ssh_schemes[i]) == scheme_len &&
		    strncasecmp(scheme, ssh_schemes[i], scheme_len) == 0)
			return true;
	}

	return false;
}
int git_net_url_dup(git_net_url *out, git_net_url *in) int git_net_url_dup(git_net_url *out, git_net_url *in)
{ {
if (in->scheme) { if (in->scheme) {
...@@ -144,12 +181,9 @@ static int url_invalid(const char *message) ...@@ -144,12 +181,9 @@ static int url_invalid(const char *message)
} }
static int url_parse_authority( static int url_parse_authority(
const char **user_start, size_t *user_len, git_net_url_parser *parser,
const char **password_start, size_t *password_len, const char *authority,
const char **host_start, size_t *host_len, size_t len)
const char **port_start, size_t *port_len,
const char *authority_start, size_t len,
const char *scheme_start, size_t scheme_len)
{ {
const char *c, *hostport_end, *host_end = NULL, const char *c, *hostport_end, *host_end = NULL,
*userpass_end, *user_end = NULL; *userpass_end, *user_end = NULL;
...@@ -165,14 +199,14 @@ static int url_parse_authority( ...@@ -165,14 +199,14 @@ static int url_parse_authority(
* walk the authority backwards so that we can parse google code's * walk the authority backwards so that we can parse google code's
* ssh urls that are not rfc compliant and allow @ in the username * ssh urls that are not rfc compliant and allow @ in the username
*/ */
for (hostport_end = authority_start + len, c = hostport_end - 1; for (hostport_end = authority + len, c = hostport_end - 1;
c >= authority_start && !user_end; c >= authority && !user_end;
c--) { c--) {
switch (state) { switch (state) {
case HOSTPORT: case HOSTPORT:
if (*c == ':') { if (*c == ':') {
*port_start = c + 1; parser->port = c + 1;
*port_len = hostport_end - *port_start; parser->port_len = hostport_end - parser->port;
host_end = c; host_end = c;
state = HOST; state = HOST;
break; break;
...@@ -200,9 +234,10 @@ static int url_parse_authority( ...@@ -200,9 +234,10 @@ static int url_parse_authority(
} }
else if (*c == '@') { else if (*c == '@') {
*host_start = c + 1; parser->host = c + 1;
*host_len = host_end ? host_end - *host_start : parser->host_len = host_end ?
hostport_end - *host_start; host_end - parser->host :
hostport_end - parser->host;
userpass_end = c; userpass_end = c;
state = USERPASS; state = USERPASS;
} }
...@@ -215,8 +250,8 @@ static int url_parse_authority( ...@@ -215,8 +250,8 @@ static int url_parse_authority(
case IPV6: case IPV6:
if (*c == '[') { if (*c == '[') {
*host_start = c + 1; parser->host = c + 1;
*host_len = host_end - *host_start; parser->host_len = host_end - parser->host;
state = HOST_END; state = HOST_END;
} }
...@@ -240,12 +275,12 @@ static int url_parse_authority( ...@@ -240,12 +275,12 @@ static int url_parse_authority(
case USERPASS: case USERPASS:
if (*c == '@' && if (*c == '@' &&
strncasecmp(scheme_start, "ssh", scheme_len)) !is_ssh_scheme(parser->scheme, parser->scheme_len))
return url_invalid("malformed hostname"); return url_invalid("malformed hostname");
if (*c == ':') { if (*c == ':') {
*password_start = c + 1; parser->password = c + 1;
*password_len = userpass_end - *password_start; parser->password_len = userpass_end - parser->password;
user_end = c; user_end = c;
state = USER; state = USER;
break; break;
...@@ -260,24 +295,24 @@ static int url_parse_authority( ...@@ -260,24 +295,24 @@ static int url_parse_authority(
switch (state) { switch (state) {
case HOSTPORT: case HOSTPORT:
*host_start = authority_start; parser->host = authority;
*host_len = (hostport_end - *host_start); parser->host_len = (hostport_end - parser->host);
break; break;
case HOST: case HOST:
*host_start = authority_start; parser->host = authority;
*host_len = (host_end - *host_start); parser->host_len = (host_end - parser->host);
break; break;
case IPV6: case IPV6:
return url_invalid("malformed hostname"); return url_invalid("malformed hostname");
case HOST_END: case HOST_END:
break; break;
case USERPASS: case USERPASS:
*user_start = authority_start; parser->user = authority;
*user_len = (userpass_end - *user_start); parser->user_len = (userpass_end - parser->user);
break; break;
case USER: case USER:
*user_start = authority_start; parser->user = authority;
*user_len = (user_end - *user_start); parser->user_len = (user_end - parser->user);
break; break;
default: default:
GIT_ASSERT(!"unhandled state"); GIT_ASSERT(!"unhandled state");
...@@ -286,97 +321,30 @@ static int url_parse_authority( ...@@ -286,97 +321,30 @@ static int url_parse_authority(
return 0; return 0;
} }
int git_net_url_parse(git_net_url *url, const char *given) static int url_parse_path(
git_net_url_parser *parser,
const char *path,
size_t len)
{ {
const char *c, *scheme_start, *authority_start, *user_start, const char *c, *end;
*password_start, *host_start, *port_start, *path_start,
*query_start, *fragment_start, *default_port;
git_str scheme = GIT_STR_INIT, user = GIT_STR_INIT,
password = GIT_STR_INIT, host = GIT_STR_INIT,
port = GIT_STR_INIT, path = GIT_STR_INIT,
query = GIT_STR_INIT, fragment = GIT_STR_INIT;
size_t scheme_len = 0, user_len = 0, password_len = 0, host_len = 0,
port_len = 0, path_len = 0, query_len = 0, fragment_len = 0;
bool hierarchical = false;
int error = 0;
enum { enum { PATH, QUERY, FRAGMENT } state = PATH;
SCHEME,
AUTHORITY_START, AUTHORITY,
PATH_START, PATH,
QUERY,
FRAGMENT
} state = SCHEME;
memset(url, 0, sizeof(git_net_url)); parser->path = path;
end = path + len;
for (c = scheme_start = given; *c; c++) { for (c = path; c < end; c++) {
switch (state) { switch (state) {
case SCHEME:
if (*c == ':') {
scheme_len = (c - scheme_start);
if (*(c+1) == '/' && *(c+2) == '/') {
c += 2;
hierarchical = true;
state = AUTHORITY_START;
} else {
state = PATH_START;
}
} else if ((*c < 'A' || *c > 'Z') &&
(*c < 'a' || *c > 'z') &&
(*c < '0' || *c > '9') &&
(*c != '+' && *c != '-' && *c != '.')) {
/*
* an illegal scheme character means that we
* were just given a relative path
*/
path_start = given;
state = PATH;
break;
}
break;
case AUTHORITY_START:
authority_start = c;
state = AUTHORITY;
/* fall through */
case AUTHORITY:
if (*c != '/')
break;
/*
* authority is sufficiently complex that we parse
* it separately
*/
if ((error = url_parse_authority(
&user_start, &user_len,
&password_start,&password_len,
&host_start, &host_len,
&port_start, &port_len,
authority_start, (c - authority_start),
scheme_start, scheme_len)) < 0)
goto done;
/* fall through */
case PATH_START:
path_start = c;
state = PATH;
/* fall through */
case PATH: case PATH:
switch (*c) { switch (*c) {
case '?': case '?':
path_len = (c - path_start); parser->path_len = (c - parser->path);
query_start = c + 1; parser->query = c + 1;
state = QUERY; state = QUERY;
break; break;
case '#': case '#':
path_len = (c - path_start); parser->path_len = (c - parser->path);
fragment_start = c + 1; parser->fragment = c + 1;
state = FRAGMENT; state = FRAGMENT;
break; break;
} }
...@@ -384,8 +352,8 @@ int git_net_url_parse(git_net_url *url, const char *given) ...@@ -384,8 +352,8 @@ int git_net_url_parse(git_net_url *url, const char *given)
case QUERY: case QUERY:
if (*c == '#') { if (*c == '#') {
query_len = (c - query_start); parser->query_len = (c - parser->query);
fragment_start = c + 1; parser->fragment = c + 1;
state = FRAGMENT; state = FRAGMENT;
} }
break; break;
...@@ -399,82 +367,70 @@ int git_net_url_parse(git_net_url *url, const char *given) ...@@ -399,82 +367,70 @@ int git_net_url_parse(git_net_url *url, const char *given)
} }
switch (state) { switch (state) {
case SCHEME:
/*
* if we never saw a ':' then we were given a relative
* path, not a bare scheme
*/
path_start = given;
path_len = (c - scheme_start);
break;
case AUTHORITY_START:
break;
case AUTHORITY:
if ((error = url_parse_authority(
&user_start, &user_len,
&password_start,&password_len,
&host_start, &host_len,
&port_start, &port_len,
authority_start, (c - authority_start),
scheme_start, scheme_len)) < 0)
goto done;
break;
case PATH_START:
break;
case PATH: case PATH:
path_len = (c - path_start); parser->path_len = (c - parser->path);
break; break;
case QUERY: case QUERY:
query_len = (c - query_start); parser->query_len = (c - parser->query);
break; break;
case FRAGMENT: case FRAGMENT:
fragment_len = (c - fragment_start); parser->fragment_len = (c - parser->fragment);
break; break;
default:
GIT_ASSERT(!"unhandled state");
} }
if (scheme_len) { return 0;
if ((error = git_str_put(&scheme, scheme_start, scheme_len)) < 0) }
static int url_parse_finalize(git_net_url *url, git_net_url_parser *parser)
{
git_str scheme = GIT_STR_INIT, user = GIT_STR_INIT,
password = GIT_STR_INIT, host = GIT_STR_INIT,
port = GIT_STR_INIT, path = GIT_STR_INIT,
query = GIT_STR_INIT, fragment = GIT_STR_INIT;
const char *default_port;
int error = 0;
if (parser->scheme_len) {
if ((error = git_str_put(&scheme, parser->scheme, parser->scheme_len)) < 0)
goto done; goto done;
git__strntolower(scheme.ptr, scheme.size); git__strntolower(scheme.ptr, scheme.size);
} }
if (user_len && if (parser->user_len &&
(error = git_str_decode_percent(&user, user_start, user_len)) < 0) (error = git_str_decode_percent(&user, parser->user, parser->user_len)) < 0)
goto done; goto done;
if (password_len && if (parser->password_len &&
(error = git_str_decode_percent(&password, password_start, password_len)) < 0) (error = git_str_decode_percent(&password, parser->password, parser->password_len)) < 0)
goto done; goto done;
if (host_len && if (parser->host_len &&
(error = git_str_decode_percent(&host, host_start, host_len)) < 0) (error = git_str_decode_percent(&host, parser->host, parser->host_len)) < 0)
goto done; goto done;
if (port_len) if (parser->port_len)
error = git_str_put(&port, port_start, port_len); error = git_str_put(&port, parser->port, parser->port_len);
else if (scheme_len && (default_port = default_port_for_scheme(scheme.ptr)) != NULL) else if (parser->scheme_len && (default_port = default_port_for_scheme(scheme.ptr)) != NULL)
error = git_str_puts(&port, default_port); error = git_str_puts(&port, default_port);
if (error < 0) if (error < 0)
goto done; goto done;
if (path_len) if (parser->path_len)
error = git_str_put(&path, path_start, path_len); error = git_str_put(&path, parser->path, parser->path_len);
else if (hierarchical) else if (parser->hierarchical)
error = git_str_puts(&path, "/"); error = git_str_puts(&path, "/");
if (error < 0) if (error < 0)
goto done; goto done;
if (query_len && if (parser->query_len &&
(error = git_str_decode_percent(&query, query_start, query_len)) < 0) (error = git_str_decode_percent(&query, parser->query, parser->query_len)) < 0)
goto done; goto done;
if (fragment_len && if (parser->fragment_len &&
(error = git_str_decode_percent(&fragment, fragment_start, fragment_len)) < 0) (error = git_str_decode_percent(&fragment, parser->fragment, parser->fragment_len)) < 0)
goto done; goto done;
url->scheme = git_str_detach(&scheme); url->scheme = git_str_detach(&scheme);
...@@ -501,6 +457,114 @@ done: ...@@ -501,6 +457,114 @@ done:
return error; return error;
} }
/*
 * Parse the string `given` into `url`.
 *
 * `url` is zeroed before parsing starts.  This function only locates
 * the scheme, the authority and the remainder of the input; the
 * authority is then handed to url_parse_authority(), the remainder to
 * url_parse_path(), and the result is produced by url_parse_finalize().
 *
 * Returns the value of url_parse_finalize() on success, or the negative
 * error code of the first helper that fails.
 */
int git_net_url_parse(git_net_url *url, const char *given)
{
	git_net_url_parser parser = GIT_NET_URL_PARSER_INIT;
	const char *c, *authority = NULL, *path = NULL;
	size_t authority_len = 0, path_len = 0;
	int error = 0;

	enum {
		SCHEME_START, SCHEME,
		AUTHORITY_START, AUTHORITY,
		PATH_START, PATH
	} state = SCHEME_START;

	memset(url, 0, sizeof(git_net_url));

	for (c = given; *c; c++) {
		switch (state) {
		case SCHEME_START:
			parser.scheme = c;
			state = SCHEME;

			/* fall through */

		case SCHEME:
			if (*c == ':') {
				parser.scheme_len = (c - parser.scheme);

				/*
				 * "//" only introduces an authority when it
				 * follows a non-empty scheme; otherwise what
				 * follows the ':' is treated as the path
				 */
				if (parser.scheme_len &&
				    *(c+1) == '/' && *(c+2) == '/') {
					c += 2;
					parser.hierarchical = 1;
					state = AUTHORITY_START;
				} else {
					state = PATH_START;
				}
			} else if (!is_valid_scheme_char(*c)) {
				/*
				 * an illegal scheme character means that we
				 * were just given a relative path
				 */
				path = given;
				state = PATH;
				break;
			}
			break;

		case AUTHORITY_START:
			authority = c;
			state = AUTHORITY;

			/* fall through */

		case AUTHORITY:
			if (*c != '/')
				break;

			/* the first '/' ends the authority and starts the path */
			authority_len = (c - authority);

			/* fall through */

		case PATH_START:
			path = c;
			state = PATH;
			break;

		case PATH:
			break;

		default:
			GIT_ASSERT(!"unhandled state");
		}
	}

	switch (state) {
	case SCHEME_START:
		/*
		 * the input was empty, so the loop never left the initial
		 * state; there is nothing to record and this is not an
		 * internal error
		 */
		break;
	case SCHEME:
		/*
		 * if we never saw a ':' then we were given a relative
		 * path, not a bare scheme
		 */
		path = given;
		path_len = (c - path);
		break;
	case AUTHORITY_START:
		break;
	case AUTHORITY:
		authority_len = (c - authority);
		break;
	case PATH_START:
		break;
	case PATH:
		path_len = (c - path);
		break;
	default:
		GIT_ASSERT(!"unhandled state");
	}

	if (authority_len &&
	    (error = url_parse_authority(&parser, authority, authority_len)) < 0)
		goto done;

	if (path_len &&
	    (error = url_parse_path(&parser, path, path_len)) < 0)
		goto done;

	error = url_parse_finalize(url, &parser);

done:
	return error;
}
static int scp_invalid(const char *message) static int scp_invalid(const char *message)
{ {
git_error_set(GIT_ERROR_NET, "invalid scp-style path: %s", message); git_error_set(GIT_ERROR_NET, "invalid scp-style path: %s", message);
......
...@@ -669,6 +669,20 @@ void test_url_parse__ipv6_invalid_addresses(void) ...@@ -669,6 +669,20 @@ void test_url_parse__ipv6_invalid_addresses(void)
/* Oddities */ /* Oddities */
/*
 * A string that begins with "://" has an empty scheme: parsing must
 * succeed, keep everything after the leading ':' as the path, and
 * leave every other component unset.
 */
void test_url_parse__empty_scheme(void)
{
	cl_git_pass(git_net_url_parse(&conndata, "://example.com/resource"));

	/* the only populated component is the path */
	cl_assert_equal_s(conndata.path, "//example.com/resource");

	/* no scheme, and therefore no host or port either */
	cl_assert_equal_s(conndata.scheme, NULL);
	cl_assert_equal_s(conndata.host, NULL);
	cl_assert_equal_s(conndata.port, NULL);
	cl_assert_equal_i(git_net_url_is_default_port(&conndata), 0);

	/* no credentials, query or fragment */
	cl_assert_equal_p(conndata.username, NULL);
	cl_assert_equal_p(conndata.password, NULL);
	cl_assert_equal_p(conndata.query, NULL);
	cl_assert_equal_p(conndata.fragment, NULL);
}
void test_url_parse__invalid_scheme_is_relative(void) void test_url_parse__invalid_scheme_is_relative(void)
{ {
cl_git_pass(git_net_url_parse(&conndata, "foo!bar://host:42/path/to/project?query_string=yes")); cl_git_pass(git_net_url_parse(&conndata, "foo!bar://host:42/path/to/project?query_string=yes"));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment