trename main.c to webdump.c - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit b82529ac7152b6326161c23b267d7719090ba168
(DIR) parent f3f8b7d8e8f4b72c072488b524cfd0b08791fdb4
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 22 Sep 2019 19:14:41 +0200
rename main.c to webdump.c
Diffstat:
D main.c | 697 ------------------------------
A webdump.c | 706 +++++++++++++++++++++++++++++++
2 files changed, 706 insertions(+), 697 deletions(-)
---
(DIR) diff --git a/main.c b/main.c
t@@ -1,697 +0,0 @@
-#include <ctype.h>
-#include <err.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <strings.h>
-#include <unistd.h>
-
-#include "xml.h"
-
-static XMLParser parser;
-
-/* uri */
-struct uri {
- char proto[48];
- char host[256];
- char path[2048];
- char port[6]; /* numeric port */
-};
-
-static int termwidth = 72;
-
-#if 0
-/* linked-list of link references */
-struct linkref {
- char *type;
- char *url;
- struct linkref *next;
-};
-
-static struct linkref *links_head;
-static struct linkref *links_cur;
-static int linkcount;
-#endif
-
-enum DisplayType {
- DisplayUnknown = 0,
- DisplayNone = 1,
- DisplayPre = 2,
- DisplayInline = 4,
- DisplayInlineBlock = 8,
- DisplayBlock = 16,
- DisplayList = 32,
- DisplayListItem = 64,
- DisplayTable = 128,
- DisplayTableRow = 256,
- DisplayTableCell = 512,
- DisplayHeader = 1024,
-};
-
-struct node {
- char tag[256];
- enum DisplayType displaytype;
-};
-
-typedef struct node Node;
-
-/* String data / memory pool */
-typedef struct string {
- char *data; /* data */
- size_t len; /* string length */
- size_t bufsiz; /* allocated size */
-} String;
-
-int absuri(char *, size_t, const char *, const char *);
-int parseuri(const char *, struct uri *, int);
-
-static char *basehref = "https://codemadness.org";
-
-static char src[4096]; /* src or href attribute */
-
-#define MAX_DEPTH 256
-static struct node nodes[MAX_DEPTH];
-static int curnode;
-
-static struct {
- char *tag;
- enum DisplayType displaytype;
-} tags[] = {
- /* pre */
- { "pre", DisplayPre },
- { "code", DisplayPre },
- /* inline */
-#if 0
- { "b", DisplayInline },
- { "i", DisplayInline },
- { "u", DisplayInline },
- { "strong", DisplayInline },
- { "em", DisplayInline },
- { "a", DisplayInline },
- { "span", DisplayInline },
- { "img", DisplayInline },
- { "label", DisplayInline },
-#endif
- /* table */
- { "table", DisplayTable },
- /* table-row */
- { "tr", DisplayTableRow },
- /* table-cell */
- { "td", DisplayTableCell },
- { "th", DisplayTableCell },
- /* list-item */
- { "li", DisplayListItem },
- /* header */
- { "h1", DisplayHeader },
- { "h2", DisplayHeader },
- { "h3", DisplayHeader },
- { "h4", DisplayHeader },
- { "h5", DisplayHeader },
- { "h6", DisplayHeader },
- /* break */
- { "br", 0 },
- /* list */
- { "ul", DisplayList },
- { "ol", DisplayList },
- /* block */
- { "p", DisplayBlock },
- { "blockquote", DisplayBlock },
- { "hr", DisplayBlock },
- { "title", DisplayBlock },
- { "nav", DisplayBlock },
- { "main", DisplayBlock },
- { "article", DisplayBlock },
- { "header", DisplayBlock },
- { "footer", DisplayBlock },
- { "div", DisplayBlock },
-};
-
-static String htmldata;
-
-static const char *ignorestate, *endtag;
-static int (*getnext)(void);
-
-/* return a space for all data until some case-insensitive string occurs. This
- is used to parse incorrect HTML/XML that contains unescaped HTML in script
- or style tags. If you see some </script> tag in a CDATA or comment
- section then e-mail W3C and tell them the web is too complex. */
-static inline int
-getnext_ignore(void)
-{
- int c;
-
- if ((c = getnext()) == EOF)
- return EOF;
-
- if (tolower(c) == tolower((unsigned char)*ignorestate)) {
- ignorestate++;
- if (*ignorestate == '\0') {
- parser.getnext = getnext; /* restore */
- return c;
- }
- } else {
- ignorestate = endtag;
- }
-
- return ' ';
-}
-
-/* Clear string only; don't free, prevents unnecessary reallocation. */
-static void
-string_clear(String *s)
-{
- if (s->data)
- s->data[0] = '\0';
- s->len = 0;
-}
-
-static void
-string_buffer_realloc(String *s, size_t newlen)
-{
- size_t alloclen;
-
- for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
- ;
- if (!(s->data = realloc(s->data, alloclen)))
- err(1, "realloc");
- s->bufsiz = alloclen;
-}
-
-static void
-string_append(String *s, const char *data, size_t len)
-{
- if (!len)
- return;
- /* check if allocation is necesary, don't shrink buffer,
- * should be more than bufsiz ofcourse. */
- if (s->len + len >= s->bufsiz)
- string_buffer_realloc(s, s->len + len + 1);
- memcpy(s->data + s->len, data, len);
- s->len += len;
- s->data[s->len] = '\0';
-}
-
-char *
-estrdup(const char *s)
-{
- char *p;
-
- if (!(p = strdup(s)))
- err(1, "strdup");
- return p;
-}
-
-void *
-ecalloc(size_t nmemb, size_t size)
-{
- void *p;
-
- if (!(p = calloc(nmemb, size)))
- err(1, "calloc");
- return p;
-}
-
-static void
-printsafe(const char *s)
-{
- for (; *s; s++) {
- switch (*s) {
- case '\t':
- case '\n':
- putchar(*s);
- break;
- default:
- if (!iscntrl((unsigned char)*s))
- putchar(*s);
- }
- }
-}
-
-int
-parseuri(const char *s, struct uri *u, int rel)
-{
- const char *p = s, *b;
- char *endptr = NULL;
- size_t i;
- unsigned long l;
-
- u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
- if (!*s)
- return 0;
-
- /* prefix is "//", don't read protocol, skip to domain parsing */
- if (!strncmp(p, "//", 2)) {
- p += 2; /* skip "//" */
- } else {
- /* protocol part */
- for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
- *p == '+' || *p == '-' || *p == '.'); p++)
- ;
- if (!strncmp(p, "://", 3)) {
- if ((size_t)(p - s) >= sizeof(u->proto))
- return -1; /* protocol too long */
- memcpy(u->proto, s, p - s);
- u->proto[p - s] = '\0';
- p += 3; /* skip "://" */
- } else {
- p = s; /* no protocol format, set to start */
- /* relative url: read rest as path, else as domain */
- if (rel)
- goto readpath;
- }
- }
- /* IPv6 address */
- if (*p == '[') {
- /* bracket not found or host too long */
- if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
- (size_t)(b - p) >= sizeof(u->host))
- return -1;
- memcpy(u->host, p, b - p + 1);
- u->host[b - p + 1] = '\0';
- p = b + 1;
- } else {
- /* domain / host part, skip until port, path or end. */
- if ((i = strcspn(p, ":/")) >= sizeof(u->host))
- return -1; /* host too long */
- memcpy(u->host, p, i);
- u->host[i] = '\0';
- p = &p[i];
- }
- /* port */
- if (*p == ':') {
- if ((i = strcspn(++p, "/")) >= sizeof(u->port))
- return -1; /* port too long */
- memcpy(u->port, p, i);
- u->port[i] = '\0';
- /* check for valid port: range 1 - 65535 */
- errno = 0;
- l = strtoul(u->port, &endptr, 10);
- if (errno || u->port[0] == '\0' || *endptr ||
- !l || l > 65535)
- return -1;
- p = &p[i];
- }
-readpath:
- if (u->host[0]) {
- p = &p[strspn(p, "/")];
- strlcpy(u->path, "/", sizeof(u->path));
- } else {
- /* absolute uri must have a host specified */
- if (!rel)
- return -1;
- }
- /* treat truncation as an error */
- if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
- return -1;
- return 0;
-}
-
-static int
-encodeuri(char *buf, size_t bufsiz, const char *s)
-{
- static const char *table = "0123456789ABCDEF";
- size_t i, b;
-
- for (i = 0, b = 0; s[i]; i++) {
- if (s[i] == ' ' ||
- (unsigned char)s[i] > 127 ||
- iscntrl((unsigned char)s[i])) {
- if (b + 3 >= bufsiz)
- return -1;
- buf[b++] = '%';
- buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
- buf[b++] = table[(unsigned char)s[i] & 15];
- } else if (b < bufsiz) {
- buf[b++] = s[i];
- } else {
- return -1;
- }
- }
- if (b >= bufsiz)
- return -1;
- buf[b] = '\0';
-
- return 0;
-}
-
-/* Get absolute uri; if `link` is relative use `base` to make it absolute.
- * the returned string in `buf` is uri encoded, see: encodeuri(). */
-int
-absuri(char *buf, size_t bufsiz, const char *link, const char *base)
-{
- struct uri ulink, ubase;
- char tmp[4096], *host, *p, *port;
- int c, r;
- size_t i;
-
- buf[0] = '\0';
- if (parseuri(base, &ubase, 0) == -1 ||
- parseuri(link, &ulink, 1) == -1 ||
- (!ulink.host[0] && !ubase.host[0]))
- return -1;
-
- if (!strncmp(link, "//", 2)) {
- host = ulink.host;
- port = ulink.port;
- } else {
- host = ulink.host[0] ? ulink.host : ubase.host;
- port = ulink.port[0] ? ulink.port : ubase.port;
- }
- r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
- ulink.proto[0] ?
- ulink.proto :
- (ubase.proto[0] ? ubase.proto : "http"),
- host,
- port[0] ? ":" : "",
- port);
- if (r < 0 || (size_t)r >= sizeof(tmp))
- return -1; /* error or truncation */
-
- /* relative to root */
- if (!ulink.host[0] && ulink.path[0] != '/') {
- /* relative to base url path */
- if (ulink.path[0]) {
- if ((p = strrchr(ubase.path, '/'))) {
- /* temporary null-terminate */
- c = *(++p);
- *p = '\0';
- i = strlcat(tmp, ubase.path, sizeof(tmp));
- *p = c; /* restore */
- if (i >= sizeof(tmp))
- return -1;
- }
- } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
- sizeof(tmp)) {
- return -1;
- }
- }
- if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
- return -1;
-
- return encodeuri(buf, bufsiz, tmp);
-}
-
-static void
-xmlcdata(XMLParser *p, const char *data, size_t datalen)
-{
- struct node *cur;
-
- cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone)
- return;
-
- printsafe(data);
-}
-
-#if 0
-static void
-xmldatastart(XMLParser *p)
-{
-// printf("DEBUG: %s\n", __func__);
-}
-#endif
-
-static void
-xmldataend(XMLParser *p)
-{
- struct node *cur;
- char *start, *s, *e;
-
-// printf("DEBUG: %s\n", __func__);
-
- if (!htmldata.data || !htmldata.len)
- return;
-
- cur = &nodes[curnode];
-
-// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
-
- if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
- /* nothing */
- } else if (cur->displaytype & DisplayPre) {
- fwrite(htmldata.data, 1, htmldata.len, stdout);
- } else {
- start = htmldata.data;
- e = htmldata.data + htmldata.len;
-
- /* TODO: better white-space handling, for example if there is only
- white-space between 2 block elements then it can be ignored. */
- for (s = start; s < e; s++) {
- if (*s == '\r') {
- continue;
- } else if (isspace((unsigned char)*s)) {
- if (s == start || !isspace((unsigned char)s[-1]))
- putchar(' ');
- } else if (!iscntrl((unsigned char)*s)) {
- putchar(*s);
- }
- }
- }
-
- string_clear(&htmldata);
-}
-
-static void
-xmldata(XMLParser *p, const char *data, size_t datalen)
-{
- struct node *cur;
-
- cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone)
- return;
-
- string_append(&htmldata, data, datalen);
-}
-
-static void
-xmldataentity(XMLParser *p, const char *data, size_t datalen)
-{
- struct node *cur;
- char buf[16];
- int n;
-
- cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone)
- return;
-
- /* convert basic XML entities */
- /* ©, copy table from Links (check license) */
- /* rsquo, hellip, ndash, lsquo */
- /* TODO: add to tscrape too */
- /* TODO: support some more HTML entities */
- n = xml_entitytostr(data, buf, sizeof(buf));
- if (n > 0)
- xmldata(p, buf, (size_t)n);
- else
- xmldata(p, data, datalen);
-}
-
-static void
-xmltagstart(XMLParser *x, const char *t, size_t tl)
-{
- struct node *cur;
- int i;
-
-// printf("start of tag: %s\n", t);
-
- if (curnode >= MAX_DEPTH - 2)
- errx(1, "max tag depth reached: %d\n", curnode);
- curnode++;
-
- cur = &nodes[curnode];
- memset(cur, 0, sizeof(*cur));
- cur->displaytype = DisplayInline;
- strlcpy(cur->tag, t, sizeof(cur->tag));
-
- src[0] = '\0'; /* src, href */
-
- /* set display type */
- for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
- if (!strcasecmp(tags[i].tag, t)) {
- cur->displaytype = tags[i].displaytype;
-// printf("match on tag: %s == %s, displaytype: %d\n",
-// tags[i].tag, t, cur->displaytype);
- break;
- }
- }
-}
-
-static void
-xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
-{
- struct node *cur;
- int i;
-
- cur = &nodes[curnode];
-
-// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
-
- if (cur->displaytype & DisplayBlock) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayPre) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTable) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTableRow) {
- fputs(" | ", stdout); /* HACK: assume last cell */
- } else if (cur->displaytype & DisplayList) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayListItem) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayHeader) {
- fputs("\n", stdout);
- if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
- if (t[1] >= '3')
- for (i = 0; i < termwidth; i++)
- putchar('-');
- else if (t[1] >= '1')
- for (i = 0; i < termwidth; i++)
- putchar('=');
- putchar('\n');
- }
- } else if (!strcasecmp(t, "br")) {
- fputs("\n", stdout);
- }
-
- curnode--;
-}
-
-static void
-xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
-{
- struct node *cur;
- int i;
-
- /* temporary replace the callback except the reader and end of tag
- restore the context once we receive the same ignored tag in the
- end tag handler */
- if (!strcasecmp(t, "script")) {
- ignorestate = endtag = "</script>";
- getnext = p->getnext; /* for restore */
- p->getnext = getnext_ignore;
- return;
- } else if (!strcasecmp(t, "style")) {
- ignorestate = endtag = "</style>";
- getnext = p->getnext; /* for restore */
- p->getnext = getnext_ignore;
- return;
- }
-
- cur = &nodes[curnode];
-
-#ifdef maybe
- /* show links as reference at the bottom */
- if (src[0]) {
- printf(" [%d]", ++linkcount);
- if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
- !strcasecmp(t, "audio"))
- printf("[%s]", t);
- /* TODO: check allocation */
- if (!links_head)
- links_cur = links_head = ecalloc(1, sizeof(*links_head));
- else
- links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
- links_cur->type = estrdup(t);
- /* TODO: absuri */
- links_cur->url = estrdup(src);
- }
- src[0] = '\0';
-#endif
-
-#if 0
- /* show links inline */
- if (src[0]) {
- char absurl[1024];
- if (absuri(absurl, sizeof(absurl), src, basehref) != -1) {
- if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
- !strcasecmp(t, "audio"))
- printf("[%s](", t);
- else
- printf("[%s](", "link");
- printsafe(absurl);
- putchar(')');
- }
- }
-#endif
-
- if (cur->displaytype & DisplayBlock) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayHeader) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTableRow) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTableCell) {
- fputs(" | ", stdout);
- } else if (cur->displaytype & DisplayList) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayListItem) {
- /* indent nested list items */
- for (i = curnode; i; i--) {
- if (nodes[i].displaytype & DisplayListItem)
- continue;
- if (nodes[i].displaytype & DisplayList)
- fputs(" ", stdout);
- }
- /* TODO: for <ol>, keep list counter on ol element (parent),
- support ordered number type only */
- fputs("* ", stdout);
- } else if (!strcasecmp(t, "hr")) { /* ruler */
- for (i = 0; i < termwidth; i++)
- putchar('-');
- }
-}
-
-static void
-xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
- size_t namelen, const char *value, size_t valuelen)
-{
- if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
- strlcpy(src, value, sizeof(src));
-
- if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
- !strcasecmp(tag, "audio")) &&
- !strcasecmp(name, "src") && valuelen)
- strlcpy(src, value, sizeof(src));
-}
-
-#ifdef maybe
-void
-printlinkrefs(void)
-{
- size_t i;
-
- printf("\n\nLink references:\n");
-
- /* TODO: add title attribute or some basic description? */
- for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
- printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
-}
-#endif
-
-int
-main(void)
-{
- if (pledge("stdio", NULL) < 0)
- err(1, "pledge");
-
- parser.xmlattr = xmlattr;
- parser.xmlcdata = xmlcdata;
- parser.xmldata = xmldata;
-// parser.xmldatastart = xmldatastart;
- parser.xmldataend = xmldataend;
- parser.xmldataentity = xmldataentity;
- parser.xmltagstart = xmltagstart;
- parser.xmltagend = xmltagend;
- parser.xmltagstartparsed = xmltagstartparsed;
-
- parser.getnext = getchar;
- xml_parse(&parser);
-
-#ifdef maybe
- printlinkrefs();
-#endif
- putchar('\n');
-
- return 0;
-}
(DIR) diff --git a/webdump.c b/webdump.c
t@@ -0,0 +1,706 @@
+#include <ctype.h>
+#include <err.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "xml.h"
+
+static XMLParser parser;
+
+#ifndef __OpenBSD__
+#define pledge(p1,p2) 0
+#endif
+
+#undef strlcat
+size_t strlcat(char *, const char *, size_t);
+#undef strlcpy
+size_t strlcpy(char *, const char *, size_t);
+
+/* uri */
+struct uri {
+ char proto[48];
+ char host[256];
+ char path[2048];
+ char port[6]; /* numeric port */
+};
+
+static int termwidth = 72;
+
+#if 0
+/* linked-list of link references */
+struct linkref {
+ char *type;
+ char *url;
+ struct linkref *next;
+};
+
+static struct linkref *links_head;
+static struct linkref *links_cur;
+static int linkcount;
+#endif
+
+enum DisplayType {
+ DisplayUnknown = 0,
+ DisplayNone = 1,
+ DisplayPre = 2,
+ DisplayInline = 4,
+ DisplayInlineBlock = 8,
+ DisplayBlock = 16,
+ DisplayList = 32,
+ DisplayListItem = 64,
+ DisplayTable = 128,
+ DisplayTableRow = 256,
+ DisplayTableCell = 512,
+ DisplayHeader = 1024,
+};
+
+struct node {
+ char tag[256];
+ enum DisplayType displaytype;
+};
+
+typedef struct node Node;
+
+/* String data / memory pool */
+typedef struct string {
+ char *data; /* data */
+ size_t len; /* string length */
+ size_t bufsiz; /* allocated size */
+} String;
+
+int absuri(char *, size_t, const char *, const char *);
+int parseuri(const char *, struct uri *, int);
+
+static char *basehref = "https://codemadness.org";
+
+static char src[4096]; /* src or href attribute */
+
+#define MAX_DEPTH 256
+static struct node nodes[MAX_DEPTH];
+static int curnode;
+
+static struct {
+ char *tag;
+ enum DisplayType displaytype;
+} tags[] = {
+ /* pre */
+ { "pre", DisplayPre },
+ { "code", DisplayPre },
+ /* inline */
+#if 0
+ { "b", DisplayInline },
+ { "i", DisplayInline },
+ { "u", DisplayInline },
+ { "strong", DisplayInline },
+ { "em", DisplayInline },
+ { "a", DisplayInline },
+ { "span", DisplayInline },
+ { "img", DisplayInline },
+ { "label", DisplayInline },
+#endif
+ /* table */
+ { "table", DisplayTable },
+ /* table-row */
+ { "tr", DisplayTableRow },
+ /* table-cell */
+ { "td", DisplayTableCell },
+ { "th", DisplayTableCell },
+ /* list-item */
+ { "li", DisplayListItem },
+ /* header */
+ { "h1", DisplayHeader },
+ { "h2", DisplayHeader },
+ { "h3", DisplayHeader },
+ { "h4", DisplayHeader },
+ { "h5", DisplayHeader },
+ { "h6", DisplayHeader },
+ /* break */
+ { "br", 0 },
+ /* list */
+ { "ul", DisplayList },
+ { "ol", DisplayList },
+ /* block */
+ { "p", DisplayBlock },
+ { "blockquote", DisplayBlock },
+ { "hr", DisplayBlock },
+ { "title", DisplayBlock },
+ { "nav", DisplayBlock },
+ { "main", DisplayBlock },
+ { "article", DisplayBlock },
+ { "header", DisplayBlock },
+ { "footer", DisplayBlock },
+ { "div", DisplayBlock },
+};
+
+static String htmldata;
+
+static const char *ignorestate, *endtag;
+static int (*getnext)(void);
+
+/* return a space for all data until some case-insensitive string occurs. This
+ is used to parse incorrect HTML/XML that contains unescaped HTML in script
+ or style tags. If you see some </script> tag in a CDATA or comment
+ section then e-mail W3C and tell them the web is too complex. */
+static inline int
+getnext_ignore(void)
+{
+ int c;
+
+ if ((c = getnext()) == EOF)
+ return EOF;
+
+ if (tolower(c) == tolower((unsigned char)*ignorestate)) {
+ ignorestate++;
+ if (*ignorestate == '\0') {
+ parser.getnext = getnext; /* restore */
+ return c;
+ }
+ } else {
+ ignorestate = endtag;
+ }
+
+ return ' ';
+}
+
+/* Clear string only; don't free, prevents unnecessary reallocation. */
+static void
+string_clear(String *s)
+{
+ if (s->data)
+ s->data[0] = '\0';
+ s->len = 0;
+}
+
+static void
+string_buffer_realloc(String *s, size_t newlen)
+{
+ size_t alloclen;
+
+ for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
+ ;
+ if (!(s->data = realloc(s->data, alloclen)))
+ err(1, "realloc");
+ s->bufsiz = alloclen;
+}
+
+static void
+string_append(String *s, const char *data, size_t len)
+{
+ if (!len)
+ return;
+ /* check if allocation is necesary, don't shrink buffer,
+ * should be more than bufsiz ofcourse. */
+ if (s->len + len >= s->bufsiz)
+ string_buffer_realloc(s, s->len + len + 1);
+ memcpy(s->data + s->len, data, len);
+ s->len += len;
+ s->data[s->len] = '\0';
+}
+
+char *
+estrdup(const char *s)
+{
+ char *p;
+
+ if (!(p = strdup(s)))
+ err(1, "strdup");
+ return p;
+}
+
+void *
+ecalloc(size_t nmemb, size_t size)
+{
+ void *p;
+
+ if (!(p = calloc(nmemb, size)))
+ err(1, "calloc");
+ return p;
+}
+
+static void
+printsafe(const char *s)
+{
+ for (; *s; s++) {
+ switch (*s) {
+ case '\t':
+ case '\n':
+ putchar(*s);
+ break;
+ default:
+ if (!iscntrl((unsigned char)*s))
+ putchar(*s);
+ }
+ }
+}
+
+int
+parseuri(const char *s, struct uri *u, int rel)
+{
+ const char *p = s, *b;
+ char *endptr = NULL;
+ size_t i;
+ unsigned long l;
+
+ u->proto[0] = u->host[0] = u->path[0] = u->port[0] = '\0';
+ if (!*s)
+ return 0;
+
+ /* prefix is "//", don't read protocol, skip to domain parsing */
+ if (!strncmp(p, "//", 2)) {
+ p += 2; /* skip "//" */
+ } else {
+ /* protocol part */
+ for (p = s; *p && (isalpha((unsigned char)*p) || isdigit((unsigned char)*p) ||
+ *p == '+' || *p == '-' || *p == '.'); p++)
+ ;
+ if (!strncmp(p, "://", 3)) {
+ if ((size_t)(p - s) >= sizeof(u->proto))
+ return -1; /* protocol too long */
+ memcpy(u->proto, s, p - s);
+ u->proto[p - s] = '\0';
+ p += 3; /* skip "://" */
+ } else {
+ p = s; /* no protocol format, set to start */
+ /* relative url: read rest as path, else as domain */
+ if (rel)
+ goto readpath;
+ }
+ }
+ /* IPv6 address */
+ if (*p == '[') {
+ /* bracket not found or host too long */
+ if (!(b = strchr(p, ']')) || (size_t)(b - p) < 3 ||
+ (size_t)(b - p) >= sizeof(u->host))
+ return -1;
+ memcpy(u->host, p, b - p + 1);
+ u->host[b - p + 1] = '\0';
+ p = b + 1;
+ } else {
+ /* domain / host part, skip until port, path or end. */
+ if ((i = strcspn(p, ":/")) >= sizeof(u->host))
+ return -1; /* host too long */
+ memcpy(u->host, p, i);
+ u->host[i] = '\0';
+ p = &p[i];
+ }
+ /* port */
+ if (*p == ':') {
+ if ((i = strcspn(++p, "/")) >= sizeof(u->port))
+ return -1; /* port too long */
+ memcpy(u->port, p, i);
+ u->port[i] = '\0';
+ /* check for valid port: range 1 - 65535 */
+ errno = 0;
+ l = strtoul(u->port, &endptr, 10);
+ if (errno || u->port[0] == '\0' || *endptr ||
+ !l || l > 65535)
+ return -1;
+ p = &p[i];
+ }
+readpath:
+ if (u->host[0]) {
+ p = &p[strspn(p, "/")];
+ strlcpy(u->path, "/", sizeof(u->path));
+ } else {
+ /* absolute uri must have a host specified */
+ if (!rel)
+ return -1;
+ }
+ /* treat truncation as an error */
+ if (strlcat(u->path, p, sizeof(u->path)) >= sizeof(u->path))
+ return -1;
+ return 0;
+}
+
+static int
+encodeuri(char *buf, size_t bufsiz, const char *s)
+{
+ static const char *table = "0123456789ABCDEF";
+ size_t i, b;
+
+ for (i = 0, b = 0; s[i]; i++) {
+ if (s[i] == ' ' ||
+ (unsigned char)s[i] > 127 ||
+ iscntrl((unsigned char)s[i])) {
+ if (b + 3 >= bufsiz)
+ return -1;
+ buf[b++] = '%';
+ buf[b++] = table[((unsigned char)s[i] >> 4) & 15];
+ buf[b++] = table[(unsigned char)s[i] & 15];
+ } else if (b < bufsiz) {
+ buf[b++] = s[i];
+ } else {
+ return -1;
+ }
+ }
+ if (b >= bufsiz)
+ return -1;
+ buf[b] = '\0';
+
+ return 0;
+}
+
+/* Get absolute uri; if `link` is relative use `base` to make it absolute.
+ * the returned string in `buf` is uri encoded, see: encodeuri(). */
+int
+absuri(char *buf, size_t bufsiz, const char *link, const char *base)
+{
+ struct uri ulink, ubase;
+ char tmp[4096], *host, *p, *port;
+ int c, r;
+ size_t i;
+
+ buf[0] = '\0';
+ if (parseuri(base, &ubase, 0) == -1 ||
+ parseuri(link, &ulink, 1) == -1 ||
+ (!ulink.host[0] && !ubase.host[0]))
+ return -1;
+
+ if (!strncmp(link, "//", 2)) {
+ host = ulink.host;
+ port = ulink.port;
+ } else {
+ host = ulink.host[0] ? ulink.host : ubase.host;
+ port = ulink.port[0] ? ulink.port : ubase.port;
+ }
+ r = snprintf(tmp, sizeof(tmp), "%s://%s%s%s",
+ ulink.proto[0] ?
+ ulink.proto :
+ (ubase.proto[0] ? ubase.proto : "http"),
+ host,
+ port[0] ? ":" : "",
+ port);
+ if (r < 0 || (size_t)r >= sizeof(tmp))
+ return -1; /* error or truncation */
+
+ /* relative to root */
+ if (!ulink.host[0] && ulink.path[0] != '/') {
+ /* relative to base url path */
+ if (ulink.path[0]) {
+ if ((p = strrchr(ubase.path, '/'))) {
+ /* temporary null-terminate */
+ c = *(++p);
+ *p = '\0';
+ i = strlcat(tmp, ubase.path, sizeof(tmp));
+ *p = c; /* restore */
+ if (i >= sizeof(tmp))
+ return -1;
+ }
+ } else if (strlcat(tmp, ubase.path, sizeof(tmp)) >=
+ sizeof(tmp)) {
+ return -1;
+ }
+ }
+ if (strlcat(tmp, ulink.path, sizeof(tmp)) >= sizeof(tmp))
+ return -1;
+
+ return encodeuri(buf, bufsiz, tmp);
+}
+
+static void
+xmlcdata(XMLParser *p, const char *data, size_t datalen)
+{
+ struct node *cur;
+
+ cur = &nodes[curnode];
+ if (cur->displaytype & DisplayNone)
+ return;
+
+ printsafe(data);
+}
+
+#if 0
+static void
+xmldatastart(XMLParser *p)
+{
+// printf("DEBUG: %s\n", __func__);
+}
+#endif
+
+static void
+xmldataend(XMLParser *p)
+{
+ struct node *cur;
+ char *start, *s, *e;
+
+// printf("DEBUG: %s\n", __func__);
+
+ if (!htmldata.data || !htmldata.len)
+ return;
+
+ cur = &nodes[curnode];
+
+// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
+
+ if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
+ /* nothing */
+ } else if (cur->displaytype & DisplayPre) {
+ fwrite(htmldata.data, 1, htmldata.len, stdout);
+ } else {
+ start = htmldata.data;
+ e = htmldata.data + htmldata.len;
+
+ /* TODO: better white-space handling, for example if there is only
+ white-space between 2 block elements then it can be ignored. */
+ for (s = start; s < e; s++) {
+ if (*s == '\r') {
+ continue;
+ } else if (isspace((unsigned char)*s)) {
+ if (s == start || !isspace((unsigned char)s[-1]))
+ putchar(' ');
+ } else if (!iscntrl((unsigned char)*s)) {
+ putchar(*s);
+ }
+ }
+ }
+
+ string_clear(&htmldata);
+}
+
+static void
+xmldata(XMLParser *p, const char *data, size_t datalen)
+{
+ struct node *cur;
+
+ cur = &nodes[curnode];
+ if (cur->displaytype & DisplayNone)
+ return;
+
+ string_append(&htmldata, data, datalen);
+}
+
+static void
+xmldataentity(XMLParser *p, const char *data, size_t datalen)
+{
+ struct node *cur;
+ char buf[16];
+ int n;
+
+ cur = &nodes[curnode];
+ if (cur->displaytype & DisplayNone)
+ return;
+
+ /* convert basic XML entities */
+ /* ©, copy table from Links (check license) */
+ /* rsquo, hellip, ndash, lsquo */
+ /* TODO: add to tscrape too */
+ /* TODO: support some more HTML entities */
+ n = xml_entitytostr(data, buf, sizeof(buf));
+ if (n > 0)
+ xmldata(p, buf, (size_t)n);
+ else
+ xmldata(p, data, datalen);
+}
+
+static void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+ struct node *cur;
+ int i;
+
+// printf("start of tag: %s\n", t);
+
+ if (curnode >= MAX_DEPTH - 2)
+ errx(1, "max tag depth reached: %d\n", curnode);
+ curnode++;
+
+ cur = &nodes[curnode];
+ memset(cur, 0, sizeof(*cur));
+ cur->displaytype = DisplayInline;
+ strlcpy(cur->tag, t, sizeof(cur->tag));
+
+ src[0] = '\0'; /* src, href */
+
+ /* set display type */
+ for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
+ if (!strcasecmp(tags[i].tag, t)) {
+ cur->displaytype = tags[i].displaytype;
+// printf("match on tag: %s == %s, displaytype: %d\n",
+// tags[i].tag, t, cur->displaytype);
+ break;
+ }
+ }
+}
+
+static void
+xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
+{
+ struct node *cur;
+ int i;
+
+ cur = &nodes[curnode];
+
+// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
+
+ if (cur->displaytype & DisplayBlock) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayPre) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayTable) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayTableRow) {
+ fputs(" | ", stdout); /* HACK: assume last cell */
+ } else if (cur->displaytype & DisplayList) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayListItem) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayHeader) {
+ fputs("\n", stdout);
+ if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
+ if (t[1] >= '3')
+ for (i = 0; i < termwidth; i++)
+ putchar('-');
+ else if (t[1] >= '1')
+ for (i = 0; i < termwidth; i++)
+ putchar('=');
+ putchar('\n');
+ }
+ } else if (!strcasecmp(t, "br")) {
+ fputs("\n", stdout);
+ }
+
+ curnode--;
+}
+
+static void
+xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
+{
+ struct node *cur;
+ int i;
+
+ /* temporary replace the callback except the reader and end of tag
+ restore the context once we receive the same ignored tag in the
+ end tag handler */
+ if (!strcasecmp(t, "script")) {
+ ignorestate = endtag = "</script>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getnext_ignore;
+ return;
+ } else if (!strcasecmp(t, "style")) {
+ ignorestate = endtag = "</style>";
+ getnext = p->getnext; /* for restore */
+ p->getnext = getnext_ignore;
+ return;
+ }
+
+ cur = &nodes[curnode];
+
+#ifdef maybe
+ /* show links as reference at the bottom */
+ if (src[0]) {
+ printf(" [%d]", ++linkcount);
+ if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
+ !strcasecmp(t, "audio"))
+ printf("[%s]", t);
+ /* TODO: check allocation */
+ if (!links_head)
+ links_cur = links_head = ecalloc(1, sizeof(*links_head));
+ else
+ links_cur = links_cur->next = ecalloc(1, sizeof(*links_head));
+ links_cur->type = estrdup(t);
+ /* TODO: absuri */
+ links_cur->url = estrdup(src);
+ }
+ src[0] = '\0';
+#endif
+
+#if 0
+ /* show links inline */
+ if (src[0]) {
+ char absurl[1024];
+ if (absuri(absurl, sizeof(absurl), src, basehref) != -1) {
+ if (!strcasecmp(t, "img") || !strcasecmp(t, "video") ||
+ !strcasecmp(t, "audio"))
+ printf("[%s](", t);
+ else
+ printf("[%s](", "link");
+ printsafe(absurl);
+ putchar(')');
+ }
+ }
+#endif
+
+ if (cur->displaytype & DisplayBlock) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayHeader) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayTableRow) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayTableCell) {
+ fputs(" | ", stdout);
+ } else if (cur->displaytype & DisplayList) {
+ fputs("\n", stdout);
+ } else if (cur->displaytype & DisplayListItem) {
+ /* indent nested list items */
+ for (i = curnode; i; i--) {
+ if (nodes[i].displaytype & DisplayListItem)
+ continue;
+ if (nodes[i].displaytype & DisplayList)
+ fputs(" ", stdout);
+ }
+ /* TODO: for <ol>, keep list counter on ol element (parent),
+ support ordered number type only */
+ fputs("* ", stdout);
+ } else if (!strcasecmp(t, "hr")) { /* ruler */
+ for (i = 0; i < termwidth; i++)
+ putchar('-');
+ }
+}
+
+static void
+xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
+ size_t namelen, const char *value, size_t valuelen)
+{
+ if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
+ strlcpy(src, value, sizeof(src));
+
+ if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
+ !strcasecmp(tag, "audio")) &&
+ !strcasecmp(name, "src") && valuelen)
+ strlcpy(src, value, sizeof(src));
+}
+
+#ifdef maybe
+void
+printlinkrefs(void)
+{
+ size_t i;
+
+ printf("\n\nLink references:\n");
+
+ /* TODO: add title attribute or some basic description? */
+ for (i = 1, links_cur = links_head; links_cur; links_cur = links_cur->next, i++)
+ printf("[%zu] - %s (%s)\n", i, links_cur->url, links_cur->type);
+}
+#endif
+
+int
+main(void)
+{
+ if (pledge("stdio", NULL) < 0)
+ err(1, "pledge");
+
+ parser.xmlattr = xmlattr;
+ parser.xmlcdata = xmlcdata;
+ parser.xmldata = xmldata;
+// parser.xmldatastart = xmldatastart;
+ parser.xmldataend = xmldataend;
+ parser.xmldataentity = xmldataentity;
+ parser.xmltagstart = xmltagstart;
+ parser.xmltagend = xmltagend;
+ parser.xmltagstartparsed = xmltagstartparsed;
+
+ parser.getnext = getchar;
+ xml_parse(&parser);
+
+#ifdef maybe
+ printlinkrefs();
+#endif
+ putchar('\n');
+
+ return 0;
+}