trename main.c to webdump.c - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit b82529ac7152b6326161c23b267d7719090ba168
 (DIR) parent f3f8b7d8e8f4b72c072488b524cfd0b08791fdb4
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 22 Sep 2019 19:14:41 +0200
       
       rename main.c to webdump.c
       
       Diffstat:
         D main.c                              |     697 ------------------------------
         A webdump.c                           |     706 +++++++++++++++++++++++++++++++
       
       2 files changed, 706 insertions(+), 697 deletions(-)
       ---
 (DIR) diff --git a/main.c b/main.c
       t@@ -1,697 +0,0 @@
       -#include <ctype.h>
       -#include <err.h>
       -#include <errno.h>
       -#include <stdio.h>
       -#include <stdlib.h>
       -#include <string.h>
       -#include <strings.h>
       -#include <unistd.h>
       -
       -#include "xml.h"
       -
       -static XMLParser parser;
       -
       -/* uri */
       -struct uri {
       -        char proto[48];
       -        char host[256];
       -        char path[2048];
       -        char port[6];     /* numeric port */
       -};
       -
       -static int termwidth = 72;
       -
       -#if 0
       -/* linked-list of link references */
       -struct linkref {
       -        char *type;
       -        char *url;
       -        struct linkref *next;
       -};
       -
       -static struct linkref *links_head;
       -static struct linkref *links_cur;
       -static int linkcount;
       -#endif
       -
       -enum DisplayType {
       -        DisplayUnknown     = 0,
       -        DisplayNone        = 1,
       -        DisplayPre         = 2,
       -        DisplayInline      = 4,
       -        DisplayInlineBlock = 8,
       -        DisplayBlock       = 16,
       -        DisplayList        = 32,
       -        DisplayListItem    = 64,
       -        DisplayTable       = 128,
       -        DisplayTableRow    = 256,
       -        DisplayTableCell   = 512,
       -        DisplayHeader      = 1024,
       -};
       -
       -struct node {
       -        char tag[256];
       -        enum DisplayType displaytype;
       -};
       -
       -typedef struct node Node;
       -
       -/* String data / memory pool */
       -typedef struct string {
       -        char   *data;   /* data */
       -        size_t  len;    /* string length */
       -        size_t  bufsiz; /* allocated size */
       -} String;
       -
       -int absuri(char *, size_t, const char *, const char *);
       -int parseuri(const char *, struct uri *, int);
       -
       -static char *basehref = "https://codemadness.org";
       -
       -static char src[4096]; /* src or href attribute */
       -
       -#define MAX_DEPTH 256
       -static struct node nodes[MAX_DEPTH];
       -static int curnode;
       -
       -static struct {
       -        char *tag;
       -        enum DisplayType displaytype;
       -} tags[] = {
       -        /* pre */
       -        { "pre", DisplayPre },
       -        { "code", DisplayPre },
       -        /* inline */
       -#if 0
       -        { "b", DisplayInline },
       -        { "i", DisplayInline },
       -        { "u", DisplayInline },
       -        { "strong", DisplayInline },
       -        { "em", DisplayInline },
       -        { "a", DisplayInline },
       -        { "span", DisplayInline },
       -        { "img", DisplayInline },
       -        { "label", DisplayInline },
       -#endif
       -        /* table */
       -        { "table", DisplayTable },
       -        /* table-row */
       -        { "tr", DisplayTableRow },
       -        /* table-cell */
       -        { "td", DisplayTableCell },
       -        { "th", DisplayTableCell },
       -        /* list-item */
       -        { "li", DisplayListItem },
       -        /* header */
       -        { "h1", DisplayHeader },
       -        { "h2", DisplayHeader },
       -        { "h3", DisplayHeader },
       -        { "h4", DisplayHeader },
       -        { "h5", DisplayHeader },
       -        { "h6", DisplayHeader },
       -        /* break */
       -        { "br", 0 },
       -        /* list */
       -        { "ul", DisplayList },
       -        { "ol", DisplayList },
       -        /* block */
       -        { "p", DisplayBlock },
       -        { "blockquote", DisplayBlock },
       -        { "hr", DisplayBlock },
       -        { "title", DisplayBlock },
       -        { "nav", DisplayBlock },
       -        { "main", DisplayBlock },
       -        { "article", DisplayBlock },
       -        { "header", DisplayBlock },
       -        { "footer", DisplayBlock },
       -        { "div", DisplayBlock },
       -};
       -
       -static String htmldata;
       -
       -static const char *ignorestate, *endtag;
       -static int (*getnext)(void);
       -
       -/* return a space for all data until some case-insensitive string occurs. This
       -   is used to parse incorrect HTML/XML that contains unescaped HTML in script
       -   or style tags. If you see some </script> tag in a CDATA or comment
       -   section then e-mail W3C and tell them the web is too complex. */
       -static inline int
       -getnext_ignore(void)
       -{
       -        int c;
       -
       -        if ((c = getnext()) == EOF)
       -                return EOF;
       -
       -        if (tolower(c) == tolower((unsigned char)*ignorestate)) {
       -                ignorestate++;
       -                if (*ignorestate == '\0') {
       -                        parser.getnext = getnext; /* restore */
       -                        return c;
       -                }
       -        } else {
       -                ignorestate = endtag;
       -        }
       -
       -        return ' ';
       -}
       -
       -/* Clear string only; don't free, prevents unnecessary reallocation. */
       -static void
       -string_clear(String *s)
       -{
       -        if (s->data)
       -                s->data[0] = '\0';
       -        s->len = 0;
       -}
       -
       -static void
       -string_buffer_realloc(String *s, size_t newlen)
       -{
       -        size_t alloclen;
       -
       -        for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
       -                ;
       -        if (!(s->data = realloc(s->data, alloclen)))
       -                err(1, "realloc");
       -        s->bufsiz = alloclen;
       -}
       -
       -static void
       -string_append(String *s, const char *data, size_t len)
       -{
       -        if (!len)
       -                return;
       -        /* check if allocation is necesary, don't shrink buffer,
       -         * should be more than bufsiz ofcourse. */
       -        if (s->len + len >= s->bufsiz)
       -                string_buffer_realloc(s, s->len + len + 1);
       -        memcpy(s->data + s->len, data, len);
       -        s->len += len;
       -        s->data[s->len] = '\0';
       -}
       -
       -char *
       -estrdup(const char *s)
       -{
       -        char *p;
       -
       -        if (!(p = strdup(s)))
       -                err(1, "strdup");
       -        return p;
       -}
       -
       -void *
       -ecalloc(size_t nmemb, size_t size)
       -{
       -        void *p;
       -
       -        if (!(p = calloc(nmemb, size)))
       -                err(1, "calloc");
       -        return p;
       -}
       -
       -static void
       -printsafe(const char *s)
       -{
       -        for (; *s; s++) {
       -                switch (*s) {
       -                case '\t':
       -                case '\n':
       -                        putchar(*s);
       -                        break;
       -                default:
       -                        if (!iscntrl((unsigned char)*s))
       -                                putchar(*s);
       -                }
       -        }
       -}
       -
       -int
       -parseuri(const char *s, struct uri *u, int rel)
       -{
       -        const char *p = s, *b;
       -        char *endptr = NULL;
       -        size_t i;
       -        unsigned long l;
       -
       -        u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
       -        if (!*s)
       -                return 0;
       -
       -        /* prefix is "//", don't read protocol, skip to domain parsing */
       -        if (!strncmp(p, "//", 2)) {
       -                p += 2; /* skip "//" */
       -        } else {
       -                /* protocol part */
       -                for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
       -                               *p == '+' || *p == '-' || *p == '.'); p++)
       -                        ;
       -                if (!strncmp(p, "://", 3)) {
       -                        if ((size_t)(p - s) >= sizeof(u->proto))
       -                                return -1; /* protocol too long */
       -                        memcpy(u->proto, s, p - s);
       -                        u->proto[p - s] = '\0';
       -                        p += 3; /* skip "://" */
       -                } else {
       -                        p = s; /* no protocol format, set to start */
       -                        /* relative url: read rest as path, else as domain */
       -                        if (rel)
       -                                goto readpath;
       -                }
       -        }
       -        /* IPv6 address */
       -        if (*p == '[') {
       -                /* bracket not found or host too long */
       -                if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
       -                    (size_t)(b - p) >= sizeof(u->host))
       -                        return -1;
       -                memcpy(u->host, p, b - p + 1);
       -                u->host[b - p + 1] = '\0';
       -                p = b + 1;
       -        } else {
       -                /* domain / host part, skip until port, path or end. */
       -                if ((i = strcspn(p, ":/")) >= sizeof(u->host))
       -                        return -1; /* host too long */
       -                memcpy(u->host, p, i);
       -                u->host[i] = '\0';
       -                p = &p[i];
       -        }
       -        /* port */
       -        if (*p == ':') {
       -                if ((i = strcspn(++p, "/")) >= sizeof(u->port))
       -                        return -1; /* port too long */
       -                memcpy(u->port, p, i);
       -                u->port[i] = '\0';
       -                /* check for valid port: range 1 - 65535 */
       -                errno = 0;
       -                l = strtoul(u->port, &endptr, 10);
       -                if (errno || u->port[0] == '\0' || *endptr ||
       -                    !l || l > 65535)
       -                        return -1;
       -                p = &p[i];
       -        }
       -readpath:
       -        if (u->host[0]) {
       -                p = &p[strspn(p, "/")];
       -                strlcpy(u->path, "/", sizeof(u->path));
       -        } else {
       -                /* absolute uri must have a host specified */
       -                if (!rel)
       -                        return -1;
       -        }
       -        /* treat truncation as an error */
       -        if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
       -                return -1;
       -        return 0;
       -}
       -
       -static int
       -encodeuri(char *buf, size_t bufsiz, const char *s)
       -{
       -        static const char *table = "0123456789ABCDEF";
       -        size_t i, b;
       -
       -        for (i = 0, b = 0; s[i]; i++) {
       -                if (s[i] == ' ' ||
       -                    (unsigned char)s[i] > 127 ||
       -                    iscntrl((unsigned char)s[i])) {
       -                        if (b + 3 >= bufsiz)
       -                                return -1;
       -                        buf[b++] = '%';
       -                        buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
       -                        buf[b++] = table[(unsigned char)s[i] & 15];
       -                } else if (b < bufsiz) {
       -                        buf[b++] = s[i];
       -                } else {
       -                        return -1;
       -                }
       -        }
       -        if (b >= bufsiz)
       -                return -1;
       -        buf[b] = '\0';
       -
       -        return 0;
       -}
       -
       -/* Get absolute uri; if `link` is relative use `base` to make it absolute.
       - * the returned string in `buf` is uri encoded, see: encodeuri(). */
       -int
       -absuri(char *buf, size_t bufsiz, const char *link, const char *base)
       -{
       -        struct uri ulink, ubase;
       -        char tmp[4096], *host, *p, *port;
       -        int c, r;
       -        size_t i;
       -
       -        buf[0] = '\0';
       -        if (parseuri(base, &ubase, 0) == -1 ||
       -            parseuri(link, &ulink, 1) == -1 ||
       -            (!ulink.host[0] && !ubase.host[0]))
       -                return -1;
       -
       -        if (!strncmp(link, "//", 2)) {
       -                host = ulink.host;
       -                port = ulink.port;
       -        } else {
       -                host = ulink.host[0] ? ulink.host : ubase.host;
       -                port = ulink.port[0] ? ulink.port : ubase.port;
       -        }
       -        r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
       -                ulink.proto[0] ?
       -                        ulink.proto :
       -                        (ubase.proto[0] ? ubase.proto : "http"),
       -                host,
       -                port[0] ? ":" : "",
       -                port);
       -        if (r < 0 || (size_t)r >= sizeof(tmp))
       -                return -1; /* error or truncation */
       -
       -        /* relative to root */
       -        if (!ulink.host[0] && ulink.path[0] != '/') {
       -                /* relative to base url path */
       -                if (ulink.path[0]) {
       -                        if ((p = strrchr(ubase.path, '/'))) {
       -                                /* temporary null-terminate */
       -                                c = *(++p);
       -                                *p = '\0';
       -                                i = strlcat(tmp, ubase.path, sizeof(tmp));
       -                                *p = c; /* restore */
       -                                if (i >= sizeof(tmp))
       -                                        return -1;
       -                        }
       -                } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
       -                           sizeof(tmp)) {
       -                        return -1;
       -                }
       -        }
       -        if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
       -                return -1;
       -
       -        return encodeuri(buf, bufsiz, tmp);
       -}
       -
       -static void
       -xmlcdata(XMLParser *p, const char *data, size_t datalen)
       -{
       -        struct node *cur;
       -
       -        cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone)
       -                return;
       -
       -        printsafe(data);
       -}
       -
       -#if 0
       -static void
       -xmldatastart(XMLParser *p)
       -{
       -//        printf("DEBUG: %s\n", __func__);
       -}
       -#endif
       -
       -static void
       -xmldataend(XMLParser *p)
       -{
       -        struct node *cur;
       -        char *start, *s, *e;
       -
       -//        printf("DEBUG: %s\n", __func__);
       -
       -        if (!htmldata.data || !htmldata.len)
       -                return;
       -
       -        cur = &nodes[curnode];
       -
       -//        printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
       -
       -        if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
       -                /* nothing */
       -        } else if (cur->displaytype & DisplayPre) {
       -                fwrite(htmldata.data, 1, htmldata.len, stdout);
       -        } else {
       -                start = htmldata.data;
       -                e = htmldata.data + htmldata.len;
       -
       -                /* TODO: better white-space handling, for example if there is only
       -                   white-space between 2 block elements then it can be ignored. */
       -                for (s = start; s < e; s++) {
       -                        if (*s == '\r') {
       -                                continue;
       -                        } else if (isspace((unsigned char)*s)) {
       -                                if (s == start || !isspace((unsigned char)s[-1]))
       -                                        putchar(' ');
       -                        } else if (!iscntrl((unsigned char)*s)) {
       -                                putchar(*s);
       -                        }
       -                }
       -        }
       -
       -        string_clear(&htmldata);
       -}
       -
       -static void
       -xmldata(XMLParser *p, const char *data, size_t datalen)
       -{
       -        struct node *cur;
       -
       -        cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone)
       -                return;
       -
       -        string_append(&htmldata, data, datalen);
       -}
       -
       -static void
       -xmldataentity(XMLParser *p, const char *data, size_t datalen)
       -{
       -        struct node *cur;
       -        char buf[16];
       -        int n;
       -
       -        cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone)
       -                return;
       -
       -        /* convert basic XML entities */
       -        /* &nbsp; &copy;, copy table from Links (check license) */
       -        /* rsquo, hellip, ndash, lsquo */
       -        /* TODO: add to tscrape too */
       -        /* TODO: support some more HTML entities */
       -        n = xml_entitytostr(data, buf, sizeof(buf));
       -        if (n > 0)
       -                xmldata(p, buf, (size_t)n);
       -        else
       -                xmldata(p, data, datalen);
       -}
       -
       -static void
       -xmltagstart(XMLParser *x, const char *t, size_t tl)
       -{
       -        struct node *cur;
       -        int i;
       -
       -//        printf("start of tag: %s\n", t);
       -
       -        if (curnode >= MAX_DEPTH - 2)
       -                errx(1, "max tag depth reached: %d\n", curnode);
       -        curnode++;
       -
       -        cur = &nodes[curnode];
       -        memset(cur, 0, sizeof(*cur));
       -        cur->displaytype = DisplayInline;
       -        strlcpy(cur->tag, t, sizeof(cur->tag));
       -
       -        src[0] = '\0'; /* src, href */
       -
       -        /* set display type */
       -        for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
       -                if (!strcasecmp(tags[i].tag, t)) {
       -                        cur->displaytype = tags[i].displaytype;
       -//                        printf("match on tag: %s == %s, displaytype: %d\n",
       -//                               tags[i].tag, t, cur->displaytype);
       -                        break;
       -                }
       -        }
       -}
       -
       -static void
       -xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
       -{
       -        struct node *cur;
       -        int i;
       -
       -        cur = &nodes[curnode];
       -
       -//        printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
       -
       -        if (cur->displaytype & DisplayBlock) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayPre) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTable) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTableRow) {
       -                fputs(" | ", stdout); /* HACK: assume last cell */
       -        } else if (cur->displaytype & DisplayList) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayListItem) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayHeader) {
       -                fputs("\n", stdout);
       -                if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
       -                        if (t[1] >= '3')
       -                                for (i = 0; i < termwidth; i++)
       -                                        putchar('-');
       -                        else if (t[1] >= '1')
       -                                for (i = 0; i < termwidth; i++)
       -                                        putchar('=');
       -                        putchar('\n');
       -                }
       -        } else if (!strcasecmp(t, "br")) {
       -                fputs("\n", stdout);
       -        }
       -
       -        curnode--;
       -}
       -
       -static void
       -xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
       -{
       -        struct node *cur;
       -        int i;
       -
       -        /* temporary replace the callback except the reader and end of tag
       -           restore the context once we receive the same ignored tag in the
       -           end tag handler */
       -        if (!strcasecmp(t, "script")) {
       -                ignorestate = endtag = "</script>";
       -                getnext = p->getnext; /* for restore */
       -                p->getnext = getnext_ignore;
       -                return;
       -        } else if (!strcasecmp(t, "style")) {
       -                ignorestate = endtag = "</style>";
       -                getnext = p->getnext; /* for restore */
       -                p->getnext = getnext_ignore;
       -                return;
       -        }
       -
       -        cur = &nodes[curnode];
       -
       -#ifdef maybe
       -        /* show links as reference at the bottom */
       -        if (src[0]) {
       -                printf(" [%d]", ++linkcount);
       -                if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
       -                    !strcasecmp(t, "audio"))
       -                        printf("[%s]", t);
       -                /* TODO: check allocation */
       -                if (!links_head)
       -                        links_cur = links_head = ecalloc(1, sizeof(*links_head));
       -                else
       -                        links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
       -                links_cur->type = estrdup(t);
       -                /* TODO: absuri */
       -                links_cur->url = estrdup(src);
       -        }
       -        src[0] = '\0';
       -#endif
       -
       -#if 0
       -        /* show links inline */
       -        if (src[0]) {
       -                char absurl[1024];
       -                if (absuri(absurl, sizeof(absurl), src, basehref) != -1) {
       -                        if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
       -                            !strcasecmp(t, "audio"))
       -                                printf("[%s](", t);
       -                        else
       -                                printf("[%s](", "link");
       -                        printsafe(absurl);
       -                        putchar(')');
       -                }
       -        }
       -#endif
       -
       -        if (cur->displaytype & DisplayBlock) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayHeader) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTableRow) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTableCell) {
       -                fputs(" | ", stdout);
       -        } else if (cur->displaytype & DisplayList) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayListItem) {
       -                /* indent nested list items */
       -                for (i = curnode; i; i--) {
       -                        if (nodes[i].displaytype & DisplayListItem)
       -                                continue;
       -                        if (nodes[i].displaytype & DisplayList)
       -                                fputs("  ", stdout);
       -                }
       -                /* TODO: for <ol>, keep list counter on ol element (parent),
       -                   support ordered number type only */
       -                fputs("* ", stdout);
       -        } else if (!strcasecmp(t, "hr")) { /* ruler */
       -                for (i = 0; i < termwidth; i++)
       -                        putchar('-');
       -        }
       -}
       -
       -static void
       -xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
       -        size_t namelen, const char *value, size_t valuelen)
       -{
       -        if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
       -                strlcpy(src, value, sizeof(src));
       -
       -        if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
       -             !strcasecmp(tag, "audio")) &&
       -            !strcasecmp(name, "src") && valuelen)
       -                strlcpy(src, value, sizeof(src));
       -}
       -
       -#ifdef maybe
       -void
       -printlinkrefs(void)
       -{
       -        size_t i;
       -
       -        printf("\n\nLink references:\n");
       -
       -        /* TODO: add title attribute or some basic description? */
       -        for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
       -                printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
       -}
       -#endif
       -
       -int
       -main(void)
       -{
       -        if (pledge("stdio", NULL) < 0)
       -                err(1, "pledge");
       -
       -        parser.xmlattr = xmlattr;
       -        parser.xmlcdata = xmlcdata;
       -        parser.xmldata = xmldata;
       -//        parser.xmldatastart = xmldatastart;
       -        parser.xmldataend = xmldataend;
       -        parser.xmldataentity = xmldataentity;
       -        parser.xmltagstart = xmltagstart;
       -        parser.xmltagend = xmltagend;
       -        parser.xmltagstartparsed = xmltagstartparsed;
       -
       -        parser.getnext = getchar;
       -        xml_parse(&parser);
       -
       -#ifdef maybe
       -        printlinkrefs();
       -#endif
       -        putchar('\n');
       -
       -        return 0;
       -}
 (DIR) diff --git a/webdump.c b/webdump.c
       t@@ -0,0 +1,706 @@
       +#include <ctype.h>
       +#include <err.h>
       +#include <errno.h>
       +#include <stdio.h>
       +#include <stdlib.h>
       +#include <string.h>
       +#include <strings.h>
       +#include <unistd.h>
       +
       +#include "xml.h"
       +
       +static XMLParser parser;
       +
       +#ifndef __OpenBSD__
       +#define pledge(p1,p2) 0
       +#endif
       +
       +#undef strlcat
       +size_t strlcat(char *, const char *, size_t);
       +#undef strlcpy
       +size_t strlcpy(char *, const char *, size_t);
       +
       +/* uri */
       +struct uri {
       +        char proto[48];
       +        char host[256];
       +        char path[2048];
       +        char port[6];     /* numeric port */
       +};
       +
       +static int termwidth = 72;
       +
       +#if 0
       +/* linked-list of link references */
       +struct linkref {
       +        char *type;
       +        char *url;
       +        struct linkref *next;
       +};
       +
       +static struct linkref *links_head;
       +static struct linkref *links_cur;
       +static int linkcount;
       +#endif
       +
       +enum DisplayType {
       +        DisplayUnknown     = 0,
       +        DisplayNone        = 1,
       +        DisplayPre         = 2,
       +        DisplayInline      = 4,
       +        DisplayInlineBlock = 8,
       +        DisplayBlock       = 16,
       +        DisplayList        = 32,
       +        DisplayListItem    = 64,
       +        DisplayTable       = 128,
       +        DisplayTableRow    = 256,
       +        DisplayTableCell   = 512,
       +        DisplayHeader      = 1024,
       +};
       +
       +struct node {
       +        char tag[256];
       +        enum DisplayType displaytype;
       +};
       +
       +typedef struct node Node;
       +
       +/* String data / memory pool */
       +typedef struct string {
       +        char   *data;   /* data */
       +        size_t  len;    /* string length */
       +        size_t  bufsiz; /* allocated size */
       +} String;
       +
       +int absuri(char *, size_t, const char *, const char *);
       +int parseuri(const char *, struct uri *, int);
       +
       +static char *basehref = "https://codemadness.org";
       +
       +static char src[4096]; /* src or href attribute */
       +
       +#define MAX_DEPTH 256
       +static struct node nodes[MAX_DEPTH];
       +static int curnode;
       +
       +static struct {
       +        char *tag;
       +        enum DisplayType displaytype;
       +} tags[] = {
       +        /* pre */
       +        { "pre", DisplayPre },
       +        { "code", DisplayPre },
       +        /* inline */
       +#if 0
       +        { "b", DisplayInline },
       +        { "i", DisplayInline },
       +        { "u", DisplayInline },
       +        { "strong", DisplayInline },
       +        { "em", DisplayInline },
       +        { "a", DisplayInline },
       +        { "span", DisplayInline },
       +        { "img", DisplayInline },
       +        { "label", DisplayInline },
       +#endif
       +        /* table */
       +        { "table", DisplayTable },
       +        /* table-row */
       +        { "tr", DisplayTableRow },
       +        /* table-cell */
       +        { "td", DisplayTableCell },
       +        { "th", DisplayTableCell },
       +        /* list-item */
       +        { "li", DisplayListItem },
       +        /* header */
       +        { "h1", DisplayHeader },
       +        { "h2", DisplayHeader },
       +        { "h3", DisplayHeader },
       +        { "h4", DisplayHeader },
       +        { "h5", DisplayHeader },
       +        { "h6", DisplayHeader },
       +        /* break */
       +        { "br", 0 },
       +        /* list */
       +        { "ul", DisplayList },
       +        { "ol", DisplayList },
       +        /* block */
       +        { "p", DisplayBlock },
       +        { "blockquote", DisplayBlock },
       +        { "hr", DisplayBlock },
       +        { "title", DisplayBlock },
       +        { "nav", DisplayBlock },
       +        { "main", DisplayBlock },
       +        { "article", DisplayBlock },
       +        { "header", DisplayBlock },
       +        { "footer", DisplayBlock },
       +        { "div", DisplayBlock },
       +};
       +
       +static String htmldata; /* character data buffered by xmldata() until
       +                           xmldataend() prints and clears it */
       +
       +static const char *ignorestate, *endtag; /* used by getnext_ignore():
       +                                            `endtag` is the end tag to look
       +                                            for, `ignorestate` points at the
       +                                            next byte of it to match */
       +static int (*getnext)(void); /* reader saved by xmltagstartparsed() so that
       +                                getnext_ignore() can call and restore it */
       +
       +/* Reader installed while inside <script> or <style>: returns a space for
       +   all data until the case-insensitive string in `endtag` has been read,
       +   then restores the saved reader. This is used to parse incorrect HTML/XML
       +   that contains unescaped HTML in script or style tags. If you see some
       +   </script> tag in a CDATA or comment section then e-mail W3C and tell
       +   them the web is too complex.
       +
       +   Returns EOF at end of input, the last byte of the end tag once the whole
       +   tag has been matched, and ' ' for every other byte. */
       +static inline int
       +getnext_ignore(void)
       +{
       +        int c;
       +
       +        if ((c = getnext()) == EOF)
       +                return EOF;
       +
       +        if (tolower(c) != tolower((unsigned char)*ignorestate)) {
       +                /* mismatch: start over, but let this byte begin a new match,
       +                   otherwise input such as "<</script>" is never recognized.
       +                   Re-testing only the first byte is sufficient because '<'
       +                   does not occur again inside the end tags used here. */
       +                ignorestate = endtag;
       +                if (tolower(c) != tolower((unsigned char)*ignorestate))
       +                        return ' ';
       +        }
       +
       +        ignorestate++;
       +        if (*ignorestate == '\0') {
       +                parser.getnext = getnext; /* restore */
       +                return c;
       +        }
       +
       +        return ' ';
       +}
       +
       +/* Make the string empty but keep its buffer, so that later appends can
       +   reuse the allocation instead of allocating again. */
       +static void
       +string_clear(String *s)
       +{
       +        s->len = 0;
       +        if (s->data != NULL)
       +                *s->data = '\0';
       +}
       +
       +/* Grow the buffer of `s` to the smallest power of two (at least 64) that is
       +   larger than `newlen` and record the new size in s->bufsiz. Exits the
       +   program when the size cannot be represented or the allocation fails. */
       +static void
       +string_buffer_realloc(String *s, size_t newlen)
       +{
       +        size_t alloclen;
       +
       +        for (alloclen = 64; alloclen <= newlen; alloclen *= 2) {
       +                /* doubling once more would wrap around to 0 and the loop
       +                   would never terminate */
       +                if (alloclen > (size_t)-1 / 2) {
       +                        errno = ENOMEM;
       +                        err(1, "realloc");
       +                }
       +        }
       +        if (!(s->data = realloc(s->data, alloclen)))
       +                err(1, "realloc");
       +        s->bufsiz = alloclen;
       +}
       +
       +/* Append `len` bytes of `data` to the string and keep it NUL-terminated.
       +   Appending zero bytes does nothing. */
       +static void
       +string_append(String *s, const char *data, size_t len)
       +{
       +        size_t total;
       +
       +        if (len == 0)
       +                return;
       +
       +        /* check if allocation is necessary; the buffer is never shrunk and
       +           must also have room for the terminating NUL byte. */
       +        total = s->len + len;
       +        if (total >= s->bufsiz)
       +                string_buffer_realloc(s, total + 1);
       +
       +        memcpy(&s->data[s->len], data, len);
       +        s->data[total] = '\0';
       +        s->len = total;
       +}
       +
       +char *
       +estrdup(const char *s)
       +{
       +        char *p;
       +
       +        if (!(p = strdup(s)))
       +                err(1, "strdup");
       +        return p;
       +}
       +
       +void *
       +ecalloc(size_t nmemb, size_t size)
       +{
       +        void *p;
       +
       +        if (!(p = calloc(nmemb, size)))
       +                err(1, "calloc");
       +        return p;
       +}
       +
       +/* Write the string to stdout, leaving out every control character except
       +   tab and newline. */
       +static void
       +printsafe(const char *s)
       +{
       +        unsigned char c;
       +
       +        while ((c = (unsigned char)*s) != '\0') {
       +                if (c == '\t' || c == '\n' || !iscntrl(c))
       +                        putchar(*s);
       +                s++;
       +        }
       +}
       +
       +/* Split the URI string `s` into the protocol, host, port and path fields of
       +   `u`. All fields are cleared first; an empty `s` leaves them empty and
       +   succeeds.
       +
       +   If `rel` is non-zero, a string without a "proto://" or "//" prefix is a
       +   relative reference and is stored as the path. If `rel` is zero such a
       +   string is parsed as starting with the host, and a missing host is an
       +   error.
       +
       +   Returns 0 on success and -1 when a part does not fit in its field, the
       +   bracketed IPv6 host is malformed, the port is not a number in the range
       +   1 - 65535 or an absolute URI has no host. */
       +int
       +parseuri(const char *s, struct uri *u, int rel)
       +{
       +        const char *p = s, *b;
       +        char *endptr = NULL;
       +        size_t i;
       +        unsigned long l;
       +
       +        u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
       +        if (!*s)
       +                return 0;
       +
       +        /* prefix is "//", don't read protocol, skip to domain parsing */
       +        if (!strncmp(p, "//", 2)) {
       +                p += 2; /* skip "//" */
       +        } else {
       +                /* protocol part */
       +                for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
       +                               *p == '+' || *p == '-' || *p == '.'); p++)
       +                        ;
       +                if (!strncmp(p, "://", 3)) {
       +                        if ((size_t)(p - s) >= sizeof(u->proto))
       +                                return -1; /* protocol too long */
       +                        memcpy(u->proto, s, p - s);
       +                        u->proto[p - s] = '\0';
       +                        p += 3; /* skip "://" */
       +                } else {
       +                        p = s; /* no protocol format, set to start */
       +                        /* relative url: read rest as path, else as domain */
       +                        if (rel)
       +                                goto readpath;
       +                }
       +        }
       +        /* IPv6 address */
       +        if (*p == '[') {
       +                /* bracket not found, host too short or host too long: the
       +                   copy below includes the closing bracket, so b - p + 1
       +                   bytes plus the NUL byte have to fit in u->host */
       +                if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
       +                    (size_t)(b - p) + 1 >= sizeof(u->host))
       +                        return -1;
       +                memcpy(u->host, p, b - p + 1);
       +                u->host[b - p + 1] = '\0';
       +                p = b + 1;
       +        } else {
       +                /* domain / host part, skip until port, path or end. */
       +                if ((i = strcspn(p, ":/")) >= sizeof(u->host))
       +                        return -1; /* host too long */
       +                memcpy(u->host, p, i);
       +                u->host[i] = '\0';
       +                p = &p[i];
       +        }
       +        /* port */
       +        if (*p == ':') {
       +                if ((i = strcspn(++p, "/")) >= sizeof(u->port))
       +                        return -1; /* port too long */
       +                memcpy(u->port, p, i);
       +                u->port[i] = '\0';
       +                /* check for valid port: range 1 - 65535 */
       +                errno = 0;
       +                l = strtoul(u->port, &endptr, 10);
       +                if (errno || u->port[0] == '\0' || *endptr ||
       +                    !l || l > 65535)
       +                        return -1;
       +                p = &p[i];
       +        }
       +readpath:
       +        if (u->host[0]) {
       +                /* with a host the path is absolute: collapse the leading
       +                   slashes into a single one */
       +                p = &p[strspn(p, "/")];
       +                strlcpy(u->path, "/", sizeof(u->path));
       +        } else {
       +                /* absolute uri must have a host specified */
       +                if (!rel)
       +                        return -1;
       +        }
       +        /* treat truncation as an error */
       +        if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
       +                return -1;
       +        return 0;
       +}
       +
       +/* Percent-encode `s` into `buf`, which holds `bufsiz` bytes. Spaces, control
       +   characters and bytes above 127 are written as "%XX" with upper-case hex
       +   digits; every other byte is copied unchanged.
       +   Returns 0 on success or -1 when the encoded string plus its NUL byte does
       +   not fit; `buf` is then left without a terminator. */
       +static int
       +encodeuri(char *buf, size_t bufsiz, const char *s)
       +{
       +        static const char hexdigits[] = "0123456789ABCDEF";
       +        const unsigned char *in;
       +        size_t o = 0;
       +
       +        for (in = (const unsigned char *)s; *in; in++) {
       +                if (*in == ' ' || *in > 127 || iscntrl(*in)) {
       +                        /* needs three bytes and room for the NUL byte */
       +                        if (o + 3 >= bufsiz)
       +                                return -1;
       +                        buf[o++] = '%';
       +                        buf[o++] = hexdigits[(*in >> 4) & 15];
       +                        buf[o++] = hexdigits[*in & 15];
       +                        continue;
       +                }
       +                if (o >= bufsiz)
       +                        return -1;
       +                buf[o++] = (char)*in;
       +        }
       +        if (o >= bufsiz)
       +                return -1;
       +        buf[o] = '\0';
       +
       +        return 0;
       +}
       +
       +/* Get absolute uri; if `link` is relative use `base` to make it absolute.
       + * The returned string in `buf` is uri encoded, see: encodeuri().
       + *
       + * The protocol is taken from `link`, else from `base`, else it is "http".
       + * Host and port are taken from `link` when it names a host (or starts with
       + * "//") and from `base` otherwise. A `link` path that does not start with
       + * '/' is resolved against the directory part of the `base` path; an empty
       + * `link` path reuses the whole `base` path.
       + *
       + * Returns 0 on success or -1 when parsing fails, neither uri has a host or
       + * the result does not fit in the internal buffer or in `buf`. */
       +int
       +absuri(char *buf, size_t bufsiz, const char *link, const char *base)
       +{
       +        struct uri ulink, ubase;
       +        char tmp[4096], *host, *p, *port;
       +        int c, r;
       +        size_t i;
       +
       +        buf[0] = '\0';
       +        if (parseuri(base, &ubase, 0) == -1 ||
       +            parseuri(link, &ulink, 1) == -1 ||
       +            (!ulink.host[0] && !ubase.host[0]))
       +                return -1;
       +
       +        /* host and port belong together: a link that names its own host
       +           must not inherit the port of the base uri, otherwise
       +           "http://other/x" resolved against "http://site:8080/" would
       +           become "http://other:8080/x". */
       +        if (!strncmp(link, "//", 2) || ulink.host[0]) {
       +                host = ulink.host;
       +                port = ulink.port;
       +        } else {
       +                host = ubase.host;
       +                port = ulink.port[0] ? ulink.port : ubase.port;
       +        }
       +        r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
       +                ulink.proto[0] ?
       +                        ulink.proto :
       +                        (ubase.proto[0] ? ubase.proto : "http"),
       +                host,
       +                port[0] ? ":" : "",
       +                port);
       +        if (r < 0 || (size_t)r >= sizeof(tmp))
       +                return -1; /* error or truncation */
       +
       +        /* link has no host and its path is not relative to the root */
       +        if (!ulink.host[0] && ulink.path[0] != '/') {
       +                /* relative to base url path */
       +                if (ulink.path[0]) {
       +                        if ((p = strrchr(ubase.path, '/'))) {
       +                                /* temporary null-terminate after the last
       +                                   '/' to append only the directory part */
       +                                c = *(++p);
       +                                *p = '\0';
       +                                i = strlcat(tmp, ubase.path, sizeof(tmp));
       +                                *p = c; /* restore */
       +                                if (i >= sizeof(tmp))
       +                                        return -1;
       +                        }
       +                } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
       +                           sizeof(tmp)) {
       +                        return -1;
       +                }
       +        }
       +        if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
       +                return -1;
       +
       +        return encodeuri(buf, bufsiz, tmp);
       +}
       +
       +/* CDATA callback: print the data with printsafe() unless the current node
       +   has the DisplayNone flag. `datalen` is not used; `data` is printed up to
       +   its NUL byte. */
       +static void
       +xmlcdata(XMLParser *p, const char *data, size_t datalen)
       +{
       +        if (!(nodes[curnode].displaytype & DisplayNone))
       +                printsafe(data);
       +}
       +
       +#if 0 /* disabled: empty placeholder for the xmldatastart callback; its
       +         registration in main() is commented out as well */
       +static void
       +xmldatastart(XMLParser *p)
       +{
       +//        printf("DEBUG: %s\n", __func__);
       +}
       +#endif
       +
       +/* Data-end callback: print the text collected in `htmldata` according to
       +   the display type of the current node, then clear the buffer.
       +   - no display type or DisplayNone: the text is dropped.
       +   - DisplayPre: the text is written unchanged.
       +   - anything else: runs of white-space are collapsed into one space and
       +     control characters are left out. */
       +static void
       +xmldataend(XMLParser *p)
       +{
       +        struct node *cur;
       +        char *s, *e;
       +        int prevspace = 0; /* last character handled was white-space */
       +
       +        if (!htmldata.data || !htmldata.len)
       +                return;
       +
       +        cur = &nodes[curnode];
       +
       +        if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
       +                /* nothing */
       +        } else if (cur->displaytype & DisplayPre) {
       +                fwrite(htmldata.data, 1, htmldata.len, stdout);
       +        } else {
       +                e = htmldata.data + htmldata.len;
       +
       +                /* TODO: better white-space handling, for example if there is only
       +                   white-space between 2 block elements then it can be ignored. */
       +                for (s = htmldata.data; s < e; s++) {
       +                        if (*s == '\r') {
       +                                /* skipped without counting as white-space:
       +                                   looking at the previous byte instead of
       +                                   keeping a flag made the '\n' of a "\r\n"
       +                                   pair see the '\r' and print no space,
       +                                   gluing the surrounding words together */
       +                                continue;
       +                        } else if (isspace((unsigned char)*s)) {
       +                                if (!prevspace)
       +                                        putchar(' ');
       +                                prevspace = 1;
       +                        } else if (!iscntrl((unsigned char)*s)) {
       +                                putchar(*s);
       +                                prevspace = 0;
       +                        }
       +                }
       +        }
       +
       +        string_clear(&htmldata);
       +}
       +
       +/* Character data callback: append the data to `htmldata` unless the current
       +   node has the DisplayNone flag. The text is printed later by
       +   xmldataend(). */
       +static void
       +xmldata(XMLParser *p, const char *data, size_t datalen)
       +{
       +        if (nodes[curnode].displaytype & DisplayNone)
       +                return;
       +
       +        string_append(&htmldata, data, datalen);
       +}
       +
       +/* Entity callback: convert the entity in `data` with xml_entitytostr() and
       +   pass the result on to xmldata(). When the conversion yields nothing the
       +   entity text itself is passed on unchanged. Nothing is done for a node
       +   with the DisplayNone flag. */
       +static void
       +xmldataentity(XMLParser *p, const char *data, size_t datalen)
       +{
       +        char decoded[16];
       +        int len;
       +
       +        if (nodes[curnode].displaytype & DisplayNone)
       +                return;
       +
       +        /* convert basic XML entities */
       +        /* TODO: &nbsp; &copy;, copy table from Links (check license);
       +           rsquo, hellip, ndash, lsquo */
       +        /* TODO: add to tscrape too */
       +        /* TODO: support some more HTML entities */
       +        len = xml_entitytostr(data, decoded, sizeof(decoded));
       +        if (len <= 0) {
       +                xmldata(p, data, datalen);
       +                return;
       +        }
       +        xmldata(p, decoded, (size_t)len);
       +}
       +
       +/* Tag-start callback: push a node for tag `t` onto the node stack, give it
       +   the display type listed in tags[] (DisplayInline when the tag is not
       +   listed) and clear the remembered src/href value. Exits the program when
       +   the maximum nesting depth is reached. */
       +static void
       +xmltagstart(XMLParser *x, const char *t, size_t tl)
       +{
       +        const size_t ntags = sizeof(tags) / sizeof(*tags);
       +        struct node *cur;
       +        size_t i;
       +
       +        if (curnode >= MAX_DEPTH - 2)
       +                errx(1, "max tag depth reached: %d\n", curnode);
       +
       +        cur = &nodes[++curnode];
       +        memset(cur, 0, sizeof(*cur));
       +        strlcpy(cur->tag, t, sizeof(cur->tag));
       +        cur->displaytype = DisplayInline;
       +
       +        src[0] = '\0'; /* src, href */
       +
       +        /* set display type: first case-insensitive match wins */
       +        for (i = 0; i < ntags; i++) {
       +                if (strcasecmp(tags[i].tag, t) != 0)
       +                        continue;
       +                cur->displaytype = tags[i].displaytype;
       +                break;
       +        }
       +}
       +
       +/* Tag-end callback: print what follows an element of the current node's
       +   display type and pop the node from the node stack.
       +   Block, pre, table, list and list-item elements end with a newline, a
       +   table row ends with " | ", a header ends with a newline and an underline
       +   of `termwidth` characters ('=' for h1 and h2, '-' for h3 to h6) and "br"
       +   prints a newline. */
       +static void
       +xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
       +{
       +        struct node *cur;
       +        int i, c;
       +
       +        cur = &nodes[curnode];
       +
       +        if (cur->displaytype & DisplayBlock) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayPre) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayTable) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayTableRow) {
       +                fputs(" | ", stdout); /* HACK: assume last cell */
       +        } else if (cur->displaytype & DisplayList) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayListItem) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayHeader) {
       +                fputs("\n", stdout);
       +                /* the display type was matched case-insensitively, so
       +                   accept an upper-case "H" here as well */
       +                if (tl == 2 && (t[0] == 'h' || t[0] == 'H') &&
       +                    t[1] >= '1' && t[1] <= '6') {
       +                        c = t[1] >= '3' ? '-' : '=';
       +                        for (i = 0; i < termwidth; i++)
       +                                putchar(c);
       +                        putchar('\n');
       +                }
       +        } else if (!strcasecmp(t, "br")) {
       +                fputs("\n", stdout);
       +        }
       +
       +        /* an end tag without a matching start tag must not move the index
       +           below the first entry of the node stack; xmltagstart() increments
       +           the index before it stores a node, so entry 0 is the bottom */
       +        if (curnode > 0)
       +                curnode--;
       +}
       +
       +/* Callback for when a start tag and its attributes have been parsed.
       +   For <script> and <style> the reader of the parser is replaced by
       +   getnext_ignore(), which blanks out everything up to the matching end tag.
       +   For every other tag the text that introduces an element of the current
       +   node's display type is printed: a newline for block, header, table-row
       +   and list elements, " | " for a table cell, indentation and "* " for a
       +   list item and a ruler of `termwidth` dashes for "hr". */
       +static void
       +xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
       +{
       +        struct node *cur;
       +        int i;
       +
       +        /* temporary replace the callback except the reader and end of tag
       +           restore the context once we receive the same ignored tag in the
       +           end tag handler */
       +        if (!strcasecmp(t, "script")) {
       +                ignorestate = endtag = "</script>";
       +                getnext = p->getnext; /* for restore */
       +                p->getnext = getnext_ignore;
       +                return;
       +        } else if (!strcasecmp(t, "style")) {
       +                ignorestate = endtag = "</style>";
       +                getnext = p->getnext; /* for restore */
       +                p->getnext = getnext_ignore;
       +                return;
       +        }
       +
       +        cur = &nodes[curnode];
       +
       +#ifdef maybe
       +        /* show links as reference at the bottom */
       +        if (src[0]) {
       +                printf(" [%d]", ++linkcount);
       +                if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
       +                    !strcasecmp(t, "audio"))
       +                        printf("[%s]", t);
       +                /* TODO: check allocation */
       +                if (!links_head)
       +                        links_cur = links_head = ecalloc(1, sizeof(*links_head));
       +                else
       +                        links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
       +                links_cur->type = estrdup(t);
       +                /* TODO: absuri */
       +                links_cur->url = estrdup(src);
       +        }
       +        src[0] = '\0';
       +#endif
       +
       +#if 0
       +        /* show links inline */
       +        if (src[0]) {
       +                char absurl[1024];
       +                if (absuri(absurl, sizeof(absurl), src, basehref) != -1) {
       +                        if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
       +                            !strcasecmp(t, "audio"))
       +                                printf("[%s](", t);
       +                        else
       +                                printf("[%s](", "link");
       +                        printsafe(absurl);
       +                        putchar(')');
       +                }
       +        }
       +#endif
       +
       +        if (cur->displaytype & DisplayBlock) {
       +                fputs("\n", stdout);
       +                /* ruler: "hr" is listed as a block element in tags[], so it
       +                   has to be drawn here; as a separate branch after the
       +                   display type checks it could never be reached */
       +                if (!strcasecmp(t, "hr")) {
       +                        for (i = 0; i < termwidth; i++)
       +                                putchar('-');
       +                }
       +        } else if (cur->displaytype & DisplayHeader) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayTableRow) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayTableCell) {
       +                fputs(" | ", stdout);
       +        } else if (cur->displaytype & DisplayList) {
       +                fputs("\n", stdout);
       +        } else if (cur->displaytype & DisplayListItem) {
       +                /* indent nested list items: two spaces for every list
       +                   element on the node stack */
       +                for (i = curnode; i; i--) {
       +                        if (nodes[i].displaytype & DisplayListItem)
       +                                continue;
       +                        if (nodes[i].displaytype & DisplayList)
       +                                fputs("  ", stdout);
       +                }
       +                /* TODO: for <ol>, keep list counter on ol element (parent),
       +                   support ordered number type only */
       +                fputs("* ", stdout);
       +        }
       +}
       +
       +/* Attribute callback: remember the target of the element in `src`, which is
       +   the non-empty "href" of an <a> tag or the non-empty "src" of an <img>,
       +   <video> or <audio> tag. Names are compared case-insensitively and the
       +   value is truncated to the size of `src`. */
       +static void
       +xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
       +        size_t namelen, const char *value, size_t valuelen)
       +{
       +        int islink, ismedia;
       +
       +        if (!valuelen)
       +                return;
       +
       +        islink = !strcasecmp(tag, "a") && !strcasecmp(name, "href");
       +        ismedia = (!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
       +                   !strcasecmp(tag, "audio")) &&
       +                  !strcasecmp(name, "src");
       +
       +        if (islink || ismedia)
       +                strlcpy(src, value, sizeof(src));
       +}
       +
       +#ifdef maybe
       +/* Print the numbered list of collected link references, starting at 1.
       +   The list is walked with the file-level `links_cur`, which is NULL
       +   afterwards. */
       +void
       +printlinkrefs(void)
       +{
       +        size_t n = 1;
       +
       +        printf("\n\nLink references:\n");
       +
       +        /* TODO: add title attribute or some basic description? */
       +        links_cur = links_head;
       +        while (links_cur) {
       +                printf("[%zu] - %s (%s)\n", n, links_cur->url, links_cur->type);
       +                links_cur = links_cur->next;
       +                n++;
       +        }
       +}
       +#endif
       +
       +/* Restrict the process to stdio with pledge(), register the parser
       +   callbacks, parse the document read from stdin and finish the output with
       +   a newline. */
       +int
       +main(void)
       +{
       +        if (pledge("stdio", NULL) < 0)
       +                err(1, "pledge");
       +
       +        /* the document is read from stdin */
       +        parser.getnext = getchar;
       +
       +        /* tag and attribute callbacks */
       +        parser.xmltagstart = xmltagstart;
       +        parser.xmltagstartparsed = xmltagstartparsed;
       +        parser.xmltagend = xmltagend;
       +        parser.xmlattr = xmlattr;
       +
       +        /* text callbacks; xmldatastart is not registered */
       +        parser.xmlcdata = xmlcdata;
       +        parser.xmldata = xmldata;
       +        parser.xmldataentity = xmldataentity;
       +        parser.xmldataend = xmldataend;
       +
       +        xml_parse(&parser);
       +
       +#ifdef maybe
       +        printlinkrefs();
       +#endif
       +        putchar('\n');
       +
       +        return 0;
       +}