sync URL parsing code - gopherproxy-c - Gopher HTTP proxy in C (CGI)
(HTM) git clone git://git.codemadness.org/gopherproxy-c
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit e9b0ad3f6eaef101ec93e70846460f9a4127e129
(DIR) parent ee13891f6be12921f48b361b571de30442b0f87b
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 19 Mar 2022 11:31:12 +0100
sync URL parsing code
Diffstat:
M gopherproxy.c | 155 ++++++++++++++++++++++++-------
1 file changed, 119 insertions(+), 36 deletions(-)
---
(DIR) diff --git a/gopherproxy.c b/gopherproxy.c
@@ -18,10 +18,15 @@
#define pledge(a,b) 0
#endif
+/* URI */
struct uri {
+ char proto[48]; /* scheme including ":" or "://" */
+ char userinfo[256]; /* username [:password] */
char host[256];
- char port[8];
+ char port[6]; /* numeric port */
char path[1024];
+ char query[1024];
+ char fragment[1024];
};
struct visited {
@@ -447,53 +452,130 @@ checkparam(const char *s)
return 1;
}
+/* check if string has a non-empty scheme / protocol part */
int
-parseuri(const char *str, struct uri *u)
+uri_hasscheme(const char *s)
{
- const char *s, *e;
+ const char *p = s;
- memset(u, 0, sizeof(struct uri));
+ for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+ *p == '+' || *p == '-' || *p == '.'; p++)
+ ;
+ /* scheme, except if empty and starts with ":" then it is a path */
+ return (*p == ':' && p != s);
+}
- s = str;
+int
+uri_parse(const char *s, struct uri *u)
+{
+ const char *p = s;
+ char *endptr;
+ size_t i;
+ long l;
- /* IPv6 */
- if (*s == '[') {
- s++;
- e = strchr(s, ']');
- if (!e || e - s + 1 >= sizeof(u->host))
- return 0;
- memcpy(u->host, s, e - s);
- u->host[e - s] = '\0';
- e++;
+ u->proto[0] = u->userinfo[0] = u->host[0] = u->port[0] = '\0';
+ u->path[0] = u->query[0] = u->fragment[0] = '\0';
+
+ /* protocol-relative */
+ if (*p == '/' && *(p + 1) == '/') {
+ p += 2; /* skip "//" */
+ goto parseauth;
+ }
+
+ /* scheme / protocol part */
+ for (; isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+ *p == '+' || *p == '-' || *p == '.'; p++)
+ ;
+ /* scheme, except if empty and starts with ":" then it is a path */
+ if (*p == ':' && p != s) {
+ if (*(p + 1) == '/' && *(p + 2) == '/')
+ p += 3; /* skip "://" */
+ else
+ p++; /* skip ":" */
+
+ if ((size_t)(p - s) >= sizeof(u->proto))
+ return -1; /* protocol too long */
+ memcpy(u->proto, s, p - s);
+ u->proto[p - s] = '\0';
+
+ if (*(p - 1) != '/')
+ goto parsepath;
} else {
- e = &s[strcspn(s, ":/")];
- if (e - s + 1 >= sizeof(u->host))
- return 0;
- memcpy(u->host, s, e - s);
- u->host[e - s] = '\0';
+ p = s; /* no scheme format, reset to start */
+ goto parsepath;
}
- if (*e == ':') {
- s = e + 1;
- e = &s[strcspn(s, "/")];
+parseauth:
+ /* userinfo (username:password) */
+ i = strcspn(p, "@/?#");
+ if (p[i] == '@') {
+ if (i >= sizeof(u->userinfo))
+ return -1; /* userinfo too long */
+ memcpy(u->userinfo, p, i);
+ u->userinfo[i] = '\0';
+ p += i + 1;
+ }
- if (e - s + 1 >= sizeof(u->port))
- return 0;
- memcpy(u->port, s, e - s);
- u->port[e - s] = '\0';
+ /* IPv6 address */
+ if (*p == '[') {
+ /* bracket not found, host too short or too long */
+ i = strcspn(p, "]");
+ if (p[i] != ']' || i < 3)
+ return -1;
+ i++; /* including "]" */
+ } else {
+ /* domain / host part, skip until port, path or end. */
+ i = strcspn(p, ":/?#");
+ }
+ if (i >= sizeof(u->host))
+ return -1; /* host too long */
+ memcpy(u->host, p, i);
+ u->host[i] = '\0';
+ p += i;
+
+ /* port */
+ if (*p == ':') {
+ p++;
+ if ((i = strcspn(p, "/?#")) >= sizeof(u->port))
+ return -1; /* port too long */
+ memcpy(u->port, p, i);
+ u->port[i] = '\0';
+ /* check for valid port: range 1 - 65535, may be empty */
+ errno = 0;
+ l = strtol(u->port, &endptr, 10);
+ if (i && (errno || *endptr || l <= 0 || l > 65535))
+ return -1;
+ p += i;
}
- if (*e && *e != '/')
- return 0; /* invalid path */
- s = e;
- e = s + strlen(s);
+parsepath:
+ /* path */
+ if ((i = strcspn(p, "?#")) >= sizeof(u->path))
+ return -1; /* path too long */
+ memcpy(u->path, p, i);
+ u->path[i] = '\0';
+ p += i;
+
+ /* query */
+ if (*p == '?') {
+ p++;
+ if ((i = strcspn(p, "#")) >= sizeof(u->query))
+ return -1; /* query too long */
+ memcpy(u->query, p, i);
+ u->query[i] = '\0';
+ p += i;
+ }
- if (e - s + 1 >= sizeof(u->path))
- return 0;
- memcpy(u->path, s, e - s);
- u->path[e - s] = '\0';
+ /* fragment */
+ if (*p == '#') {
+ p++;
+ if ((i = strlen(p)) >= sizeof(u->fragment))
+ return -1; /* fragment too long */
+ memcpy(u->fragment, p, i);
+ u->fragment[i] = '\0';
+ }
- return 1;
+ return 0;
}
int
@@ -527,7 +609,8 @@ main(void)
else
uri = query;
- if (!parseuri(uri, &u))
+ if (!uri_hasscheme(uri) ||
+ uri_parse(uri, &u) == -1)
die(400, "Invalid uri: %s\n", uri);
if (u.host[0] == '\0')
die(400, "Invalid hostname\n");