timprovements - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 9ac2648a64f0b2d125da2a39ed8e8f4ff2e234b4
(DIR) parent b708236e10ae2b6af6e62514f2ca159fd6eeeabd
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 21 Sep 2019 15:23:08 +0200
improvements
- initial url parsing and base href support (WIP).
- rename xstrdup and xcalloc to estrdup and ecalloc (exits on failure).
- show links inline, disable printing references at the bottom for now.
- update TODO.
Diffstat:
M TODO | 11 ++++-------
M main.c | 223 +++++++++++++++++++++++++++++--
2 files changed, 213 insertions(+), 21 deletions(-)
---
(DIR) diff --git a/TODO b/TODO
t@@ -1,12 +1,9 @@
+- base href.
+ specify and parse relative urls, allow specifying a base and also parse <base href="">
+- handle whitespace and tag types properly, at least: inline-block, inline, block, pre
- print safe (not certain control chars, except newline, TAB etc).
-
- improve/remove duplicate white-space/newlines?
- cleanup code.
-
-===
-
- <code> should not be treated as a block (<pre> does?)
-
-? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
-
- add links as reference, for example on page: http://absmagazin.de/2018 the MP3 urls.
+? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
(DIR) diff --git a/main.c b/main.c
t@@ -1,8 +1,6 @@
-/* TODO: escape control characters */
-/* TODO: specify and parse relative url, allow to specify base and also parse <base href=""> ? */
-
#include <ctype.h>
#include <err.h>
+#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
t@@ -11,11 +9,18 @@
#include "xml.h"
-/* string and size */
-/*#define STRP(s) s,sizeof(s)-1*/
-
static XMLParser parser;
+/* uri split into its parts by parseuri(); every field is a NUL-terminated string, empty when the part is absent */
+struct uri {
+ char proto[48]; /* protocol: the text before "://" */
+ char host[256]; /* host name, or an IPv6 literal including its brackets */
+ char path[2048]; /* path and everything that follows it */
+ char port[6]; /* numeric port */
+};
+
+#if 0
+/* linked-list of link references */
struct linkref {
char *type;
char *url;
t@@ -25,6 +30,7 @@ struct linkref {
static struct linkref *links_head;
static struct linkref *links_cur;
static int linkcount;
+#endif
struct node {
char tag[256];
t@@ -42,11 +48,19 @@ typedef struct string {
size_t bufsiz; /* allocated size */
} String;
+int absuri(char *, size_t, const char *, const char *);
+int parseuri(const char *, struct uri *, int);
+
+static char *basehref = "https://codemadness.org";
+
static char src[4096]; /* src or href attribute */
#define MAX_DEPTH 256
static struct node nodes[MAX_DEPTH];
static int curnode;
+
+/* TODO: temporary workaround; handle whitespace and tag types properly,
+ at least: inline-block, inline, block, pre */
static int ignoredata;
static char *pretags[] = {
t@@ -154,7 +168,7 @@ string_append(String *s, const char *data, size_t len)
}
char *
-xstrdup(const char *s)
+estrdup(const char *s)
{
char *p;
t@@ -164,7 +178,7 @@ xstrdup(const char *s)
}
void *
-xcalloc(size_t nmemb, size_t size)
+ecalloc(size_t nmemb, size_t size)
{
void *p;
t@@ -189,6 +203,171 @@ printsafe(const char *s)
}
}
+int
+parseuri(const char *s, struct uri *u, int rel)
+{
+ const char *p = s, *b;
+ char *endptr = NULL;
+ size_t i;
+ unsigned long l;
+ /* Split `s` into u->proto, u->host, u->port and u->path. If `rel` is non-zero, input without "://" or a leading "//" is read entirely as a path; otherwise a host is required. Returns 0 on success, -1 on a malformed or too long part. */
+ u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0'; /* all parts start empty */
+ if (!*s)
+ return 0; /* empty input: success with all parts empty */
+
+ /* prefix is "//", don't read protocol, skip to domain parsing */
+ if (!strncmp(p, "//", 2)) {
+ p += 2; /* skip "//" */
+ } else {
+ /* protocol part */
+ for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+ *p == '+' || *p == '-' || *p == '.'); p++)
+ ;
+ if (!strncmp(p, "://", 3)) {
+ if ((size_t)(p - s) >= sizeof(u->proto))
+ return -1; /* protocol too long */
+ memcpy(u->proto, s, p - s);
+ u->proto[p - s] = '\0';
+ p += 3; /* skip "://" */
+ } else {
+ p = s; /* no protocol format, set to start */
+ /* relative url: read rest as path, else as domain */
+ if (rel)
+ goto readpath;
+ }
+ }
+ /* IPv6 address */
+ if (*p == '[') {
+ /* bracket not found or host too long: the b - p + 1 bytes copied below (including ']') plus the NUL must fit in u->host */
+ if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
+ (size_t)(b - p) + 1 >= sizeof(u->host))
+ return -1;
+ memcpy(u->host, p, b - p + 1);
+ u->host[b - p + 1] = '\0';
+ p = b + 1;
+ } else {
+ /* domain / host part, skip until port, path or end. */
+ if ((i = strcspn(p, ":/")) >= sizeof(u->host))
+ return -1; /* host too long */
+ memcpy(u->host, p, i);
+ u->host[i] = '\0';
+ p = &p[i];
+ }
+ /* port */
+ if (*p == ':') {
+ if ((i = strcspn(++p, "/")) >= sizeof(u->port))
+ return -1; /* port too long */
+ memcpy(u->port, p, i);
+ u->port[i] = '\0';
+ /* check for valid port: range 1 - 65535 */
+ errno = 0;
+ l = strtoul(u->port, &endptr, 10);
+ if (errno || u->port[0] == '\0' || *endptr ||
+ !l || l > 65535)
+ return -1;
+ p = &p[i];
+ }
+readpath:
+ if (u->host[0]) {
+ p = &p[strspn(p, "/")]; /* skip all leading slashes ... */
+ strlcpy(u->path, "/", sizeof(u->path)); /* ... and start the path with a single one */
+ } else {
+ /* absolute uri must have a host specified */
+ if (!rel)
+ return -1;
+ }
+ /* treat truncation as an error */
+ if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
+ return -1;
+ return 0;
+}
+/* Copy `s` into `buf` (`bufsiz` bytes), writing spaces, bytes above 127 and control characters as %XX; returns 0, or -1 if the result does not fit. */
+static int
+encodeuri(char *buf, size_t bufsiz, const char *s)
+{
+ static const char *table = "0123456789ABCDEF"; /* hex digits, indexed by nibble value */
+ size_t i, b; /* i: read index into s, b: write index into buf */
+
+ for (i = 0, b = 0; s[i]; i++) {
+ if (s[i] == ' ' ||
+ (unsigned char)s[i] > 127 ||
+ iscntrl((unsigned char)s[i])) {
+ if (b + 3 >= bufsiz) /* need room for "%XX" plus the terminating NUL */
+ return -1;
+ buf[b++] = '%';
+ buf[b++] = table[((unsigned char)s[i] >> 4) & 15]; /* high nibble */
+ buf[b++] = table[(unsigned char)s[i] & 15]; /* low nibble */
+ } else if (b < bufsiz) {
+ buf[b++] = s[i]; /* any other byte is copied unchanged */
+ } else {
+ return -1;
+ }
+ }
+ if (b >= bufsiz) /* no room left for the terminating NUL */
+ return -1;
+ buf[b] = '\0';
+
+ return 0;
+}
+
+/* Get absolute uri; if `link` is relative use `base` to make it absolute.
+ * the returned string in `buf` is uri encoded, see: encodeuri(). Returns the result of encodeuri() on success, -1 on error (`buf` is then empty or unspecified). */
+int
+absuri(char *buf, size_t bufsiz, const char *link, const char *base)
+{
+ struct uri ulink, ubase;
+ char tmp[4096], *host, *p, *port; /* tmp holds the unencoded absolute uri while it is assembled */
+ int c, r;
+ size_t i;
+
+ buf[0] = '\0';
+ if (parseuri(base, &ubase, 0) == -1 || /* base is parsed as absolute: it must have a host */
+ parseuri(link, &ulink, 1) == -1 || /* link may be relative */
+ (!ulink.host[0] && !ubase.host[0]))
+ return -1;
+
+ if (!strncmp(link, "//", 2)) { /* link starts with "//": take host and port from the link only */
+ host = ulink.host;
+ port = ulink.port;
+ } else { /* otherwise each of host and port falls back to the base when the link has none */
+ host = ulink.host[0] ? ulink.host : ubase.host;
+ port = ulink.port[0] ? ulink.port : ubase.port;
+ }
+ r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s", /* proto://host[:port] */
+ ulink.proto[0] ?
+ ulink.proto :
+ (ubase.proto[0] ? ubase.proto : "http"), /* protocol: link, else base, else "http" */
+ host,
+ port[0] ? ":" : "",
+ port);
+ if (r < 0 || (size_t)r >= sizeof(tmp))
+ return -1; /* error or truncation */
+
+ /* link has no host and its path does not start with '/': prefix it with (part of) the base path */
+ if (!ulink.host[0] && ulink.path[0] != '/') {
+ /* relative to base url path */
+ if (ulink.path[0]) {
+ if ((p = strrchr(ubase.path, '/'))) { /* keep the base path up to and including its last '/' */
+ /* temporary null-terminate */
+ c = *(++p);
+ *p = '\0';
+ i = strlcat(tmp, ubase.path, sizeof(tmp));
+ *p = c; /* restore */
+ if (i >= sizeof(tmp))
+ return -1;
+ }
+ } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >= /* empty link path: use the whole base path */
+ sizeof(tmp)) {
+ return -1;
+ }
+ }
+ if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp)) /* truncation is an error */
+ return -1;
+
+ return encodeuri(buf, bufsiz, tmp);
+}
+
+
static void
xmlcdata(XMLParser *p, const char *data, size_t datalen)
{
t@@ -367,7 +546,8 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
cur = &nodes[curnode];
-#if 1
+#if 0
+ /* show links as reference at the bottom */
if (src[0]) {
printf(" [%d]", ++linkcount);
if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
t@@ -375,15 +555,28 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
printf("[%s]", t);
/* TODO: check allocation */
if (!links_head)
- links_cur = links_head = xcalloc(1, sizeof(*links_head));
+ links_cur = links_head = ecalloc(1, sizeof(*links_head));
else
- links_cur = links_cur->next = xcalloc(1, sizeof(*links_head));
- links_cur->type = xstrdup(t);
- links_cur->url = xstrdup(src);
+ links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
+ links_cur->type = estrdup(t);
+ links_cur->url = estrdup(src);
}
src[0] = '\0';
#endif
+ /* show links inline */
+ if (src[0]) {
+ char absurl[1024];
+ if (absuri(absurl, sizeof(absurl), src, basehref) != -1) {
+ if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
+ !strcasecmp(t, "audio")) {
+ printf("[%s](%s) ", t, absurl);
+ } else {
+ printf("[%s](%s) ", "link", absurl);
+ }
+ }
+ }
+
if (cur->isblock)
fputs("\n", stdout);
t@@ -421,6 +614,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
strlcpy(src, value, sizeof(src));
}
+#if 0
void
printlinkrefs(void)
{
t@@ -432,6 +626,7 @@ printlinkrefs(void)
for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
}
+#endif
int
main(void)
t@@ -451,7 +646,7 @@ main(void)
parser.getnext = getchar;
xml_parse(&parser);
- printlinkrefs();
+/* printlinkrefs();*/
putchar('\n');
return 0;