improvements - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 9ac2648a64f0b2d125da2a39ed8e8f4ff2e234b4
 (DIR) parent b708236e10ae2b6af6e62514f2ca159fd6eeeabd
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 21 Sep 2019 15:23:08 +0200
       
       improvements
       
       - initial url parsing and base href support (WIP).
       - rename xstrdup and xcalloc to estrdup and ecalloc (exits on failure).
       - show links inline, disable printing references at the bottom for now.
       - update TODO.
       
       Diffstat:
         M TODO                                |      11 ++++-------
         M main.c                              |     223 +++++++++++++++++++++++++++++--
       
       2 files changed, 213 insertions(+), 21 deletions(-)
       ---
 (DIR) diff --git a/TODO b/TODO
       @@ -1,12 +1,9 @@
       +- base href.
       +  specify and parse relative url, allow to specify base and also parse <base href="">
       +- handle whitespace, and tag types properly: at least: inline-block, inline, block, pre
        - print safe (not certain control chars, except newline, TAB etc).
       -
        - improve/remove duplicate white-space/newlines?
        - cleanup code.
       -
       -===
       -
        - <code> should not be treated as a block (<pre> does?)
       -
       -? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
       -
        - add links as reference, for example on page: http://absmagazin.de/2018 the MP3 urls.
       +? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
 (DIR) diff --git a/main.c b/main.c
       @@ -1,8 +1,6 @@
       -/* TODO: escape control characters */
       -/* TODO: specify and parse relative url, allow to specify base and also parse <base href=""> ? */
       -
        #include <ctype.h>
        #include <err.h>
       +#include <errno.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>
       @@ -11,11 +9,18 @@
        
        #include "xml.h"
        
       -/* string and size */
       -/*#define STRP(s) s,sizeof(s)-1*/
       -
        static XMLParser parser;
        
       +/* uri */
       +struct uri {
       +        char proto[48];
       +        char host[256];
       +        char path[2048];
       +        char port[6];     /* numeric port */
       +};
       +
       +#if 0
       +/* linked-list of link references */
        struct linkref {
                char *type;
                char *url;
       @@ -25,6 +30,7 @@ struct linkref {
        static struct linkref *links_head;
        static struct linkref *links_cur;
        static int linkcount;
       +#endif
        
        struct node {
                char tag[256];
       @@ -42,11 +48,19 @@ typedef struct string {
                size_t  bufsiz; /* allocated size */
        } String;
        
       +int absuri(char *, size_t, const char *, const char *);
       +int parseuri(const char *, struct uri *, int);
       +
       +static char *basehref = "https://codemadness.org";
       +
        static char src[4096]; /* src or href attribute */
        
        #define MAX_DEPTH 256
        static struct node nodes[MAX_DEPTH];
        static int curnode;
       +
       +/* TODO: temporary workaround, handle whitespace, and tag types properly:
       +   at least: inline-block, inline, block, pre */
        static int ignoredata;
        
        static char *pretags[] = {
       @@ -154,7 +168,7 @@ string_append(String *s, const char *data, size_t len)
        }
        
        char *
       -xstrdup(const char *s)
       +estrdup(const char *s)
        {
                char *p;
        
       @@ -164,7 +178,7 @@ xstrdup(const char *s)
        }
        
        void *
       -xcalloc(size_t nmemb, size_t size)
       +ecalloc(size_t nmemb, size_t size)
        {
                void *p;
        
       @@ -189,6 +203,171 @@ printsafe(const char *s)
                }
        }
        
       +int
       +parseuri(const char *s, struct uri *u, int rel)
       +{
       +        const char *p = s, *b;
       +        char *endptr = NULL;
       +        size_t i;
       +        unsigned long l;
       +
       +        u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
       +        if (!*s)
       +                return 0;
       +
       +        /* prefix is "//", don't read protocol, skip to domain parsing */
       +        if (!strncmp(p, "//", 2)) {
       +                p += 2; /* skip "//" */
       +        } else {
       +                /* protocol part */
       +                for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
       +                               *p == '+' || *p == '-' || *p == '.'); p++)
       +                        ;
       +                if (!strncmp(p, "://", 3)) {
       +                        if ((size_t)(p - s) >= sizeof(u->proto))
       +                                return -1; /* protocol too long */
       +                        memcpy(u->proto, s, p - s);
       +                        u->proto[p - s] = '\0';
       +                        p += 3; /* skip "://" */
       +                } else {
       +                        p = s; /* no protocol format, set to start */
       +                        /* relative url: read rest as path, else as domain */
       +                        if (rel)
       +                                goto readpath;
       +                }
       +        }
       +        /* IPv6 address */
       +        if (*p == '[') {
       +                /* bracket not found or host too long */
       +                if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
       +                    (size_t)(b - p) >= sizeof(u->host))
       +                        return -1;
       +                memcpy(u->host, p, b - p + 1);
       +                u->host[b - p + 1] = '\0';
       +                p = b + 1;
       +        } else {
       +                /* domain / host part, skip until port, path or end. */
       +                if ((i = strcspn(p, ":/")) >= sizeof(u->host))
       +                        return -1; /* host too long */
       +                memcpy(u->host, p, i);
       +                u->host[i] = '\0';
       +                p = &p[i];
       +        }
       +        /* port */
       +        if (*p == ':') {
       +                if ((i = strcspn(++p, "/")) >= sizeof(u->port))
       +                        return -1; /* port too long */
       +                memcpy(u->port, p, i);
       +                u->port[i] = '\0';
       +                /* check for valid port: range 1 - 65535 */
       +                errno = 0;
       +                l = strtoul(u->port, &endptr, 10);
       +                if (errno || u->port[0] == '\0' || *endptr ||
       +                    !l || l > 65535)
       +                        return -1;
       +                p = &p[i];
       +        }
       +readpath:
       +        if (u->host[0]) {
       +                p = &p[strspn(p, "/")];
       +                strlcpy(u->path, "/", sizeof(u->path));
       +        } else {
       +                /* absolute uri must have a host specified */
       +                if (!rel)
       +                        return -1;
       +        }
       +        /* treat truncation as an error */
       +        if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
       +                return -1;
       +        return 0;
       +}
       +
       +static int
       +encodeuri(char *buf, size_t bufsiz, const char *s)
       +{
       +        static const char *table = "0123456789ABCDEF";
       +        size_t i, b;
       +
       +        for (i = 0, b = 0; s[i]; i++) {
       +                if (s[i] == ' ' ||
       +                    (unsigned char)s[i] > 127 ||
       +                    iscntrl((unsigned char)s[i])) {
       +                        if (b + 3 >= bufsiz)
       +                                return -1;
       +                        buf[b++] = '%';
       +                        buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
       +                        buf[b++] = table[(unsigned char)s[i] & 15];
       +                } else if (b < bufsiz) {
       +                        buf[b++] = s[i];
       +                } else {
       +                        return -1;
       +                }
       +        }
       +        if (b >= bufsiz)
       +                return -1;
       +        buf[b] = '\0';
       +
       +        return 0;
       +}
       +
       +/* Get absolute uri; if `link` is relative use `base` to make it absolute.
       + * the returned string in `buf` is uri encoded, see: encodeuri(). */
       +int
       +absuri(char *buf, size_t bufsiz, const char *link, const char *base)
       +{
       +        struct uri ulink, ubase;
       +        char tmp[4096], *host, *p, *port;
       +        int c, r;
       +        size_t i;
       +
       +        buf[0] = '\0';
       +        if (parseuri(base, &ubase, 0) == -1 ||
       +            parseuri(link, &ulink, 1) == -1 ||
       +            (!ulink.host[0] && !ubase.host[0]))
       +                return -1;
       +
       +        if (!strncmp(link, "//", 2)) {
       +                host = ulink.host;
       +                port = ulink.port;
       +        } else {
       +                host = ulink.host[0] ? ulink.host : ubase.host;
       +                port = ulink.port[0] ? ulink.port : ubase.port;
       +        }
       +        r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
       +                ulink.proto[0] ?
       +                        ulink.proto :
       +                        (ubase.proto[0] ? ubase.proto : "http"),
       +                host,
       +                port[0] ? ":" : "",
       +                port);
       +        if (r < 0 || (size_t)r >= sizeof(tmp))
       +                return -1; /* error or truncation */
       +
       +        /* relative to root */
       +        if (!ulink.host[0] && ulink.path[0] != '/') {
       +                /* relative to base url path */
       +                if (ulink.path[0]) {
       +                        if ((p = strrchr(ubase.path, '/'))) {
       +                                /* temporary null-terminate */
       +                                c = *(++p);
       +                                *p = '\0';
       +                                i = strlcat(tmp, ubase.path, sizeof(tmp));
       +                                *p = c; /* restore */
       +                                if (i >= sizeof(tmp))
       +                                        return -1;
       +                        }
       +                } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
       +                           sizeof(tmp)) {
       +                        return -1;
       +                }
       +        }
       +        if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
       +                return -1;
       +
       +        return encodeuri(buf, bufsiz, tmp);
       +}
       +
       +
        static void
        xmlcdata(XMLParser *p, const char *data, size_t datalen)
        {
       @@ -367,7 +546,8 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
        
                cur = &nodes[curnode];
        
       -#if 1
       +#if 0
       +        /* show links as reference at the bottom */
                if (src[0]) {
                        printf(" [%d]", ++linkcount);
                        if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
       @@ -375,15 +555,28 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
                                printf("[%s]", t);
                        /* TODO: check allocation */
                        if (!links_head)
       -                        links_cur = links_head = xcalloc(1, sizeof(*links_head));
       +                        links_cur = links_head = ecalloc(1, sizeof(*links_head));
                        else
       -                        links_cur = links_cur->next = xcalloc(1, sizeof(*links_head));
       -                links_cur->type = xstrdup(t);
       -                links_cur->url = xstrdup(src);
       +                        links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
       +                links_cur->type = estrdup(t);
       +                links_cur->url = estrdup(src);
                }
                src[0] = '\0';
        #endif
        
       +        /* show links inline */
       +        if (src[0]) {
       +                char absurl[1024];
       +                if (absuri(absurl, sizeof(absurl), src, basehref) != -1) {
       +                        if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
       +                            !strcasecmp(t, "audio")) {
       +                                printf("[%s](%s) ", t, absurl);
       +                        } else {
       +                                printf("[%s](%s) ", "link", absurl);
       +                        }
       +                }
       +        }
       +
                if (cur->isblock)
                        fputs("\n", stdout);
        
       @@ -421,6 +614,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
                        strlcpy(src, value, sizeof(src));
        }
        
       +#if 0
        void
        printlinkrefs(void)
        {
       @@ -432,6 +626,7 @@ printlinkrefs(void)
                for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
                        printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
        }
       +#endif
        
        int
        main(void)
       @@ -451,7 +646,7 @@ main(void)
                parser.getnext = getchar;
                xml_parse(&parser);
        
       -        printlinkrefs();
       +/*        printlinkrefs();*/
                putchar('\n');
        
                return 0;