timprovements - webdump - [FORK] git://git.codemadness.org/webdump
(HTM) git clone git://git.z3bra.org/webdump.git
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit b86543ca5fdf0639730cfe957866904abbf398f1
(DIR) parent 3bebb5f7caaf7ac19c3aa58571856ee27e2e418c
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Wed, 20 Nov 2019 18:24:24 +0100
improvements
- Add a soft line-wrapping mode: tries to wrap words to termwidth.
A line can still be longer if a single word is longer than termwidth.
- Handle ordered and unordered lists and logic for numbering of nodes:
list items, table rows, table cells, etc.
- Handle &nbsp; properly. Output the UTF-8 codepoint.
- Much improved rendering of white-spaces (still needs some tweaks here and
there).
- Print <pre> content safely (no control-characters).
- Fix printing of ruler (<hr/>).
Diffstat:
M TODO | 40 +++++++++++++++++++++++++++++--
M webdump.c | 251 ++++++++++++++++++++-----------
M xml.c | 2 +-
3 files changed, 205 insertions(+), 88 deletions(-)
---
(DIR) diff --git a/TODO b/TODO
t@@ -1,6 +1,37 @@
-- improve/remove duplicate white-space/newlines?
+<div>
+<span>test</span>
+
+abc <- rendered as block?
+
+<span>test</span>
+
+</div>
+
+see lobsters.html
+
+
+
+
+? if isatty(1) then ioctl width and try this width for rulers, etc?
+
+
+- printing of data in nested elements in <pre> (with markup).
+
+
+- keep count of rows per table (parent), cells per row (parent).
+
+
+- div in li:
+<ul>
+<li><div>test</div></li>
+<li><div>test</div></li>
+</ul>
+
+
+maybe merge some changes back from z3bra (2f30):
+https://git.z3bra.org/webdump/log.html
+
- handle whitespace, and tag types properly: at least: inline-block, inline, block, pre.
- - still handle &nbsp; as non-breaking-space, ^.
- base href.
specify and parse relative url, allow to specify base and also parse <base href="">
- show link to <frame>/<frameset>/<iframe>.
t@@ -19,6 +50,11 @@
- write a proper Makefile.
- write documentation and man pages.
- cleanup code.
+
+x ordered lists.
+x improve/remove duplicate white-space/newlines?
+x still handle &nbsp; as non-breaking-space, ^: ignore printc rule.
+
? word-wrapping.
? aligned/fancy table cell rendering.
? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
(DIR) diff --git a/webdump.c b/webdump.c
t@@ -30,6 +30,8 @@ struct uri {
static int termwidth = 72;
+//#define LINKREFS
+
#ifdef LINKREFS
/* linked-list of link references */
struct linkref {
t@@ -51,19 +53,27 @@ enum DisplayType {
DisplayInlineBlock = 8,
DisplayBlock = 16,
DisplayList = 32,
- DisplayListItem = 64,
- DisplayTable = 128,
- DisplayTableRow = 256,
- DisplayTableCell = 512,
- DisplayHeader = 1024,
+ DisplayListOrdered = 64,
+ DisplayListItem = 128,
+ DisplayTable = 256,
+ DisplayTableRow = 512,
+ DisplayTableCell = 1024,
+ DisplayHeader = 2048,
+};
+
+struct tag {
+ char *name;
+ enum DisplayType displaytype; // TODO: use struct tag reference
+ enum DisplayType parenttype;
};
struct node {
- char tag[256];
- enum DisplayType displaytype;
+ char tagname[256];
+ struct tag tag;
+ size_t count; // TODO: rename: childnodes or w/e
};
-typedef struct node Node;
+typedef struct node Node; // TODO: remove
/* String data / memory pool */
typedef struct string {
t@@ -79,14 +89,22 @@ static char *basehref = "";
static char src[4096]; /* src or href attribute */
+static String htmldata;
+
+/* for white-space output handling:
+ 1 = whitespace emitted (suppress repeated), 2 = other characters on this line
+ Behaviour:
+ * White-space data before non-whitespace data in tags are ignored on a line.
+ * Repeated white-space are ignored: a single space (' ') is emitted.
+*/
+static int whitespace_mode = 0;
+static size_t ncharsline = 0;
+
#define MAX_DEPTH 256
static struct node nodes[MAX_DEPTH];
static int curnode;
-static struct {
- char *tag;
- enum DisplayType displaytype;
-} tags[] = {
+static struct tag tags[] = {
/* pre */
{ "pre", DisplayPre },
{ "code", DisplayPre },
t@@ -105,12 +123,12 @@ static struct {
/* table */
{ "table", DisplayTable },
/* table-row */
- { "tr", DisplayTableRow },
+ { "tr", DisplayTableRow, DisplayTable },
/* table-cell */
- { "td", DisplayTableCell },
- { "th", DisplayTableCell },
+ { "td", DisplayTableCell, DisplayTableRow },
+ { "th", DisplayTableCell, DisplayTableRow },
/* list-item */
- { "li", DisplayListItem },
+ { "li", DisplayListItem, DisplayList },
/* header */
{ "h1", DisplayHeader },
{ "h2", DisplayHeader },
t@@ -122,7 +140,7 @@ static struct {
{ "br", 0 },
/* list */
{ "ul", DisplayList },
- { "ol", DisplayList },
+ { "ol", DisplayList|DisplayListOrdered },
/* block */
{ "p", DisplayBlock },
{ "blockquote", DisplayBlock },
t@@ -136,8 +154,6 @@ static struct {
{ "div", DisplayBlock },
};
-static String htmldata;
-
static const char *ignorestate, *endtag;
static int (*getnext)(void);
t@@ -222,9 +238,60 @@ ecalloc(size_t nmemb, size_t size)
}
static void
-printsafe(const char *s)
+newline(void)
+{
+ putchar('\n');
+ whitespace_mode &= ~2; /* no characters on this line yet */
+ ncharsline = 0;
+}
+
+/* print one character safely */
+static void
+printc(int c)
+{
+ if (isspace(c)) {
+ whitespace_mode |= 1;
+ } else {
+ if (whitespace_mode == 3) {
+ putchar(' ');
+ ncharsline++;
+ /* DEBUG: soft line-wrapping on white-space */
+ /* TODO: better line-wrapping */
+ if (ncharsline > termwidth)
+ newline();
+ }
+
+ whitespace_mode = 2;
+ if (!iscntrl(c)) {
+ putchar(c);
+ ncharsline++;
+ }
+ }
+}
+
+/* Find nearest parent node belonging to type. For example a listitem -> list */
+static struct node *
+findparentoftype(int cur)
+{
+ int i;
+
+ if (!nodes[cur].tag.parenttype)
+ return NULL;
+
+ for (i = cur; i; i--) {
+ if ((nodes[i].tag.displaytype & nodes[cur].tag.parenttype))
+ return &nodes[i];
+ }
+
+ return NULL;
+}
+
+static void
+printsafe(const char *s, size_t len)
{
- for (; *s; s++) {
+ size_t i;
+
+ for (i = 0; *s && i < len; s++, i++) {
switch (*s) {
case '\t':
case '\n':
t@@ -407,10 +474,10 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen)
struct node *cur;
cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone)
+ if (cur->tag.displaytype & DisplayNone)
return;
- printsafe(data);
+ printsafe(data, datalen);
}
static void
t@@ -419,35 +486,23 @@ xmldataend(XMLParser *p)
struct node *cur;
char *start, *s, *e;
-// printf("DEBUG: %s\n", __func__);
-
if (!htmldata.data || !htmldata.len)
return;
cur = &nodes[curnode];
-// printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
+// printf("DEBUG: node: %s, type: %d\n", cur->tagname, cur->tag.displaytype);
- if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
+ if (cur->tag.displaytype == DisplayUnknown || (cur->tag.displaytype & DisplayNone)) {
/* nothing */
- } else if (cur->displaytype & DisplayPre) {
- fwrite(htmldata.data, 1, htmldata.len, stdout);
+ } else if (cur->tag.displaytype & DisplayPre) {
+ printsafe(htmldata.data, htmldata.len);
} else {
start = htmldata.data;
e = htmldata.data + htmldata.len;
- /* TODO: better white-space handling, for example if there is only
- white-space between 2 block elements then it can be ignored. */
- for (s = start; s < e; s++) {
- if (*s == '\r') {
- continue;
- } else if (isspace((unsigned char)*s)) {
- if (s == start || !isspace((unsigned char)s[-1]))
- putchar(' ');
- } else if (!iscntrl((unsigned char)*s)) {
- putchar(*s);
- }
- }
+ for (s = start; s < e; s++)
+ printc((unsigned char)*s);
}
string_clear(&htmldata);
t@@ -459,7 +514,7 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
struct node *cur;
cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone)
+ if (cur->tag.displaytype & DisplayNone)
return;
string_append(&htmldata, data, datalen);
t@@ -473,15 +528,23 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
int n;
cur = &nodes[curnode];
- if (cur->displaytype & DisplayNone)
+ if (cur->tag.displaytype & DisplayNone)
return;
- /* convert basic XML entities */
/* ©, copy table from Links (check license) */
/* rsquo, hellip, ndash, lsquo */
/* TODO: add to tscrape too */
/* TODO: support some more HTML entities */
- n = xml_entitytostr(data, buf, sizeof(buf));
+
+ n = 0;
+ if (!strcmp(data, "&nbsp;") || !strcmp(data, "&NBSP;")) {
+ memcpy(buf, "\xc2\xa0", 3); /* UTF-8: nbsp */
+ n = 2;
+ }
+
+ /* convert basic XML entities */
+ if (n <= 0)
+ n = xml_entitytostr(data, buf, sizeof(buf));
if (n > 0)
xmldata(p, buf, (size_t)n);
else
t@@ -502,17 +565,20 @@ xmltagstart(XMLParser *x, const char *t, size_t tl)
cur = &nodes[curnode];
memset(cur, 0, sizeof(*cur));
- cur->displaytype = DisplayInline;
- strlcpy(cur->tag, t, sizeof(cur->tag));
+ cur->tag.displaytype = DisplayInline;
+ strlcpy(cur->tagname, t, sizeof(cur->tagname));
src[0] = '\0'; /* src, href */
/* set display type */
for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
- if (!strcasecmp(tags[i].tag, t)) {
- cur->displaytype = tags[i].displaytype;
+ if (!strcasecmp(tags[i].name, t)) {
+ cur->count = 0;
+ memcpy(&(cur->tag), &tags[i], sizeof(tags[i]));
+
+// cur->displaytype = tags[i].displaytype;
// printf("DEBUG: match on tag: %s == %s, displaytype: %d\n",
-// tags[i].tag, t, cur->displaytype);
+// tags[i].tag, t, cur->tag.displaytype);
break;
}
}
t@@ -526,22 +592,23 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
cur = &nodes[curnode];
-// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
+// printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t,
+// cur->tag.displaytype, cur->tagname);
- if (cur->displaytype & DisplayBlock) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayPre) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTable) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTableRow) {
+ if (cur->tag.displaytype & DisplayBlock) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayPre) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayTable) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayTableRow) {
fputs(" | ", stdout); /* HACK: assume last cell */
- } else if (cur->displaytype & DisplayList) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayListItem) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayHeader) {
- fputs("\n", stdout);
+ } else if (cur->tag.displaytype & DisplayList) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayListItem) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayHeader) {
+ newline();
if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
if (t[1] >= '3')
for (i = 0; i < termwidth; i++)
t@@ -549,10 +616,13 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
else if (t[1] >= '1')
for (i = 0; i < termwidth; i++)
putchar('=');
- putchar('\n');
+ newline();
}
- } else if (!strcasecmp(t, "br")) {
- fputs("\n", stdout);
+ }
+
+ /* specific tag handling */
+ if (!strcasecmp(t, "br")) {
+ newline();
}
curnode--;
t@@ -561,7 +631,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
static void
xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
{
- struct node *cur;
+ struct node *cur, *parent = NULL;
int i;
/* temporary replace the callback except the reader and end of tag
t@@ -615,34 +685,45 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
printf("[%s](", t);
else
printf("[%s](", "link");
- printsafe(absurl);
+ printsafe(absurl, strlen(absurl));
putchar(')');
}
}
#endif
- if (cur->displaytype & DisplayBlock) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayHeader) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTableRow) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayTableCell) {
+ /* find first parent node of type and increase child node count */
+ if (cur->tag.parenttype && (parent = findparentoftype(curnode)))
+ parent->count++;
+
+ if (cur->tag.displaytype & DisplayBlock) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayHeader) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayTableRow) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayList) {
+ newline();
+ } else if (cur->tag.displaytype & DisplayTableCell) {
fputs(" | ", stdout);
- } else if (cur->displaytype & DisplayList) {
- fputs("\n", stdout);
- } else if (cur->displaytype & DisplayListItem) {
+ } else if (cur->tag.displaytype & DisplayListItem) {
/* indent nested list items */
for (i = curnode; i; i--) {
- if (nodes[i].displaytype & DisplayListItem)
+ if (nodes[i].tag.displaytype & DisplayListItem)
continue;
- if (nodes[i].displaytype & DisplayList)
+ if (nodes[i].tag.displaytype & DisplayList)
fputs(" ", stdout);
}
- /* TODO: for <ol>, keep list counter on ol element (parent),
- support ordered number type only */
- fputs("* ", stdout);
- } else if (!strcasecmp(t, "hr")) { /* ruler */
+ /* find first parent node and ordered numbers or unordered */
+ if (parent) {
+ if (parent->tag.displaytype & DisplayListOrdered)
+ printf("%zu. ", parent->count);
+ else
+ fputs("* ", stdout);
+ }
+ }
+
+ /* specific tag handling */
+ if (!strcasecmp(t, "hr")) { /* ruler */
for (i = 0; i < termwidth; i++)
putchar('-');
}
(DIR) diff --git a/xml.c b/xml.c
t@@ -259,7 +259,7 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
{ "LT;", '<' },
{ "GT;", '>' },
{ "APOS;", '\'' },
- { "QUOT;", '"' }
+ { "QUOT;", '"' },
};
size_t i;