timprovements - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit b86543ca5fdf0639730cfe957866904abbf398f1
 (DIR) parent 3bebb5f7caaf7ac19c3aa58571856ee27e2e418c
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Wed, 20 Nov 2019 18:24:24 +0100
       
       improvements
       
       - Add a soft line-wrapping mode: tries to wrap words to termwidth.
         A line can still be longer if a single word is longer than termwidth.
       - Handle ordered and unordered lists and logic for numbering of nodes:
         list items, table rows, table cells, etc.
       - Handle &nbsp; properly. Output the UTF-8 codepoint.
       - Much improved rendering of white-space (still needs some tweaks here and
         there).
       - Print <pre> content safely (no control-characters).
       - Fix printing of ruler (<hr/>).
       
       Diffstat:
         M TODO                                |      40 +++++++++++++++++++++++++++++--
         M webdump.c                           |     251 ++++++++++++++++++++-----------
         M xml.c                               |       2 +-
       
       3 files changed, 205 insertions(+), 88 deletions(-)
       ---
 (DIR) diff --git a/TODO b/TODO
       t@@ -1,6 +1,37 @@
       -- improve/remove duplicate white-space/newlines?
       +<div>
       +<span>test</span>
       +
       +abc                              <- rendered as block?
       +
       +<span>test</span>
       +
       +</div>
       +
       +see lobsters.html
       +
       +
       +
       +
       +? if isatty(1) then ioctl width and try this width for rulers, etc?
       +
       +
       +- printing of data in nested elements in <pre> (with markup).
       +
       +
       +- keep count of rows per table (parent), cells per row (parent).
       +
       +
       +- div in li:
       +<ul>
       +<li><div>test</div></li>
       +<li><div>test</div></li>
       +</ul>
       +
       +
       +maybe merge some changes back from z3bra (2f30):
       +https://git.z3bra.org/webdump/log.html
       +
        - handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre.
       -  - still handle &nbsp; as non-breaking-space, ^.
        - base href.
          specify and parse relative url, allow to specify base and also parse <base href="">
        - show link to <frame>/<frameset>/<iframe>.
       t@@ -19,6 +50,11 @@
        - write a proper Makefile.
        - write documentation and man pages.
        - cleanup code.
       +
       +x ordered lists.
       +x improve/remove duplicate white-space/newlines?
       +x still handle &nbsp; as non-breaking-space, ^: ignore printc rule.
       +
        ? word-wrapping.
        ? aligned/fancy table cell rendering.
        ? xml.c: make sure to always call xmldata handler even if datalen == 0 ?
 (DIR) diff --git a/webdump.c b/webdump.c
       t@@ -30,6 +30,8 @@ struct uri {
        
        static int termwidth = 72;
        
       +//#define LINKREFS
       +
        #ifdef LINKREFS
        /* linked-list of link references */
        struct linkref {
       t@@ -51,19 +53,27 @@ enum DisplayType {
                DisplayInlineBlock = 8,
                DisplayBlock       = 16,
                DisplayList        = 32,
       -        DisplayListItem    = 64,
       -        DisplayTable       = 128,
       -        DisplayTableRow    = 256,
       -        DisplayTableCell   = 512,
       -        DisplayHeader      = 1024,
       +        DisplayListOrdered = 64,
       +        DisplayListItem    = 128,
       +        DisplayTable       = 256,
       +        DisplayTableRow    = 512,
       +        DisplayTableCell   = 1024,
       +        DisplayHeader      = 2048,
       +};
       +
       +struct tag {
       +        char *name;
       +        enum DisplayType displaytype; // TODO: use struct tag reference
       +        enum DisplayType parenttype;
        };
        
        struct node {
       -        char tag[256];
       -        enum DisplayType displaytype;
       +        char tagname[256];
       +        struct tag tag;
       +        size_t count; // TODO: rename: childnodes or w/e
        };
        
       -typedef struct node Node;
       +typedef struct node Node; // TODO: remove
        
        /* String data / memory pool */
        typedef struct string {
       t@@ -79,14 +89,22 @@ static char *basehref = "";
        
        static char src[4096]; /* src or href attribute */
        
       +static String htmldata;
       +
       +/* for white-space output handling:
       +   1 = whitespace emitted (suppress repeated), 2 = other characters on this line
       +   Behaviour:
       +   * White-space data before non-whitespace data in tags are ignored on a line.
       +   * Repeated white-space are ignored: a single space (' ') is emitted.
       +*/
       +static int whitespace_mode = 0;
       +static size_t ncharsline = 0;
       +
        #define MAX_DEPTH 256
        static struct node nodes[MAX_DEPTH];
        static int curnode;
        
       -static struct {
       -        char *tag;
       -        enum DisplayType displaytype;
       -} tags[] = {
       +static struct tag tags[] = {
                /* pre */
                { "pre", DisplayPre },
                { "code", DisplayPre },
       t@@ -105,12 +123,12 @@ static struct {
                /* table */
                { "table", DisplayTable },
                /* table-row */
       -        { "tr", DisplayTableRow },
       +        { "tr", DisplayTableRow, DisplayTable },
                /* table-cell */
       -        { "td", DisplayTableCell },
       -        { "th", DisplayTableCell },
       +        { "td", DisplayTableCell, DisplayTableRow },
       +        { "th", DisplayTableCell, DisplayTableRow },
                /* list-item */
       -        { "li", DisplayListItem },
       +        { "li", DisplayListItem, DisplayList },
                /* header */
                { "h1", DisplayHeader },
                { "h2", DisplayHeader },
       t@@ -122,7 +140,7 @@ static struct {
                { "br", 0 },
                /* list */
                { "ul", DisplayList },
       -        { "ol", DisplayList },
       +        { "ol", DisplayList|DisplayListOrdered },
                /* block */
                { "p", DisplayBlock },
                { "blockquote", DisplayBlock },
       t@@ -136,8 +154,6 @@ static struct {
                { "div", DisplayBlock },
        };
        
       -static String htmldata;
       -
        static const char *ignorestate, *endtag;
        static int (*getnext)(void);
        
       t@@ -222,9 +238,60 @@ ecalloc(size_t nmemb, size_t size)
        }
        
        static void
       -printsafe(const char *s)
       +newline(void)
       +{
       +        putchar('\n');
       +        whitespace_mode &= ~2; /* no characters on this line yet */
       +        ncharsline = 0;
       +}
       +
       +/* print one character safely */
       +static void
       +printc(int c)
       +{
       +        if (isspace(c)) {
       +                whitespace_mode |= 1;
       +        } else {
       +                if (whitespace_mode == 3) {
       +                        putchar(' ');
       +                        ncharsline++;
       +                        /* DEBUG: soft line-wrapping on white-space */
       +                        /* TODO: better line-wrapping */
       +                        if (ncharsline > termwidth)
       +                                newline();
       +                }
       +
       +                whitespace_mode = 2;
       +                if (!iscntrl(c)) {
       +                        putchar(c);
       +                        ncharsline++;
       +                }
       +        }
       +}
       +
       +/* Find nearest parent node belonging to type. For example a listitem -> list */
       +static struct node *
       +findparentoftype(int cur)
       +{
       +        int i;
       +
       +        if (!nodes[cur].tag.parenttype)
       +                return NULL;
       +
       +        for (i = cur; i; i--) {
       +                if ((nodes[i].tag.displaytype & nodes[cur].tag.parenttype))
       +                        return &nodes[i];
       +        }
       +
       +        return NULL;
       +}
       +
       +static void
       +printsafe(const char *s, size_t len)
        {
       -        for (; *s; s++) {
       +        size_t i;
       +
       +        for (i = 0; *s && i < len; s++, i++) {
                        switch (*s) {
                        case '\t':
                        case '\n':
       t@@ -407,10 +474,10 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen)
                struct node *cur;
        
                cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone)
       +        if (cur->tag.displaytype & DisplayNone)
                        return;
        
       -        printsafe(data);
       +        printsafe(data, datalen);
        }
        
        static void
       t@@ -419,35 +486,23 @@ xmldataend(XMLParser *p)
                struct node *cur;
                char *start, *s, *e;
        
       -//        printf("DEBUG: %s\n", __func__);
       -
                if (!htmldata.data || !htmldata.len)
                        return;
        
                cur = &nodes[curnode];
        
       -//        printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
       +//        printf("DEBUG: node: %s, type: %d\n", cur->tagname, cur->tag.displaytype);
        
       -        if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
       +        if (cur->tag.displaytype == DisplayUnknown || (cur->tag.displaytype & DisplayNone)) {
                        /* nothing */
       -        } else if (cur->displaytype & DisplayPre) {
       -                fwrite(htmldata.data, 1, htmldata.len, stdout);
       +        } else if (cur->tag.displaytype & DisplayPre) {
       +                printsafe(htmldata.data, htmldata.len);
                } else {
                        start = htmldata.data;
                        e = htmldata.data + htmldata.len;
        
       -                /* TODO: better white-space handling, for example if there is only
       -                   white-space between 2 block elements then it can be ignored. */
       -                for (s = start; s < e; s++) {
       -                        if (*s == '\r') {
       -                                continue;
       -                        } else if (isspace((unsigned char)*s)) {
       -                                if (s == start || !isspace((unsigned char)s[-1]))
       -                                        putchar(' ');
       -                        } else if (!iscntrl((unsigned char)*s)) {
       -                                putchar(*s);
       -                        }
       -                }
       +                for (s = start; s < e; s++)
       +                        printc((unsigned char)*s);
                }
        
                string_clear(&htmldata);
       t@@ -459,7 +514,7 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
                struct node *cur;
        
                cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone)
       +        if (cur->tag.displaytype & DisplayNone)
                        return;
        
                string_append(&htmldata, data, datalen);
       t@@ -473,15 +528,23 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
                int n;
        
                cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone)
       +        if (cur->tag.displaytype & DisplayNone)
                        return;
        
       -        /* convert basic XML entities */
                /* &nbsp; &copy;, copy table from Links (check license) */
                /* rsquo, hellip, ndash, lsquo */
                /* TODO: add to tscrape too */
                /* TODO: support some more HTML entities */
       -        n = xml_entitytostr(data, buf, sizeof(buf));
       +
       +        n = 0;
       +        if (!strcmp(data, "&nbsp;") || !strcmp(data, "&NBSP;")) {
       +                memcpy(buf, "\xc2\xa0", 3); /* UTF-8: nbsp */
       +                n = 2;
       +        }
       +
       +        /* convert basic XML entities */
       +        if (n <= 0)
       +                n = xml_entitytostr(data, buf, sizeof(buf));
                if (n > 0)
                        xmldata(p, buf, (size_t)n);
                else
       t@@ -502,17 +565,20 @@ xmltagstart(XMLParser *x, const char *t, size_t tl)
        
                cur = &nodes[curnode];
                memset(cur, 0, sizeof(*cur));
       -        cur->displaytype = DisplayInline;
       -        strlcpy(cur->tag, t, sizeof(cur->tag));
       +        cur->tag.displaytype = DisplayInline;
       +        strlcpy(cur->tagname, t, sizeof(cur->tagname));
        
                src[0] = '\0'; /* src, href */
        
                /* set display type */
                for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
       -                if (!strcasecmp(tags[i].tag, t)) {
       -                        cur->displaytype = tags[i].displaytype;
       +                if (!strcasecmp(tags[i].name, t)) {
       +                        cur->count = 0;
       +                        memcpy(&(cur->tag), &tags[i], sizeof(tags[i]));
       +
       +//                        cur->displaytype = tags[i].displaytype;
        //                        printf("DEBUG: match on tag: %s == %s, displaytype: %d\n",
       -//                               tags[i].tag, t, cur->displaytype);
       +//                               tags[i].tag, t, cur->tag.displaytype);
                                break;
                        }
                }
       t@@ -526,22 +592,23 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        
                cur = &nodes[curnode];
        
       -//        printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
       +//        printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t,
       +//               cur->tag.displaytype, cur->tagname);
        
       -        if (cur->displaytype & DisplayBlock) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayPre) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTable) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTableRow) {
       +        if (cur->tag.displaytype & DisplayBlock) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayPre) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayTable) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayTableRow) {
                        fputs(" | ", stdout); /* HACK: assume last cell */
       -        } else if (cur->displaytype & DisplayList) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayListItem) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayHeader) {
       -                fputs("\n", stdout);
       +        } else if (cur->tag.displaytype & DisplayList) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayListItem) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayHeader) {
       +                newline();
                        if (tl == 2 && t[0] == 'h' && t[1] >= '1' && t[1] <= '6') {
                                if (t[1] >= '3')
                                        for (i = 0; i < termwidth; i++)
       t@@ -549,10 +616,13 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
                                else if (t[1] >= '1')
                                        for (i = 0; i < termwidth; i++)
                                                putchar('=');
       -                        putchar('\n');
       +                        newline();
                        }
       -        } else if (!strcasecmp(t, "br")) {
       -                fputs("\n", stdout);
       +        }
       +
       +        /* specific tag handling */
       +        if (!strcasecmp(t, "br")) {
       +                newline();
                }
        
                curnode--;
       t@@ -561,7 +631,7 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        static void
        xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
        {
       -        struct node *cur;
       +        struct node *cur, *parent = NULL;
                int i;
        
                /* temporary replace the callback except the reader and end of tag
       t@@ -615,34 +685,45 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
                                        printf("[%s](", t);
                                else
                                        printf("[%s](", "link");
       -                        printsafe(absurl);
       +                        printsafe(absurl, strlen(absurl));
                                putchar(')');
                        }
                }
        #endif
        
       -        if (cur->displaytype & DisplayBlock) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayHeader) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTableRow) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayTableCell) {
       +        /* find first parent node of type and increase child node count */
       +        if (cur->tag.parenttype && (parent = findparentoftype(curnode)))
       +                parent->count++;
       +
       +        if (cur->tag.displaytype & DisplayBlock) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayHeader) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayTableRow) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayList) {
       +                newline();
       +        } else if (cur->tag.displaytype & DisplayTableCell) {
                        fputs(" | ", stdout);
       -        } else if (cur->displaytype & DisplayList) {
       -                fputs("\n", stdout);
       -        } else if (cur->displaytype & DisplayListItem) {
       +        } else if (cur->tag.displaytype & DisplayListItem) {
                        /* indent nested list items */
                        for (i = curnode; i; i--) {
       -                        if (nodes[i].displaytype & DisplayListItem)
       +                        if (nodes[i].tag.displaytype & DisplayListItem)
                                        continue;
       -                        if (nodes[i].displaytype & DisplayList)
       +                        if (nodes[i].tag.displaytype & DisplayList)
                                        fputs("  ", stdout);
                        }
       -                /* TODO: for <ol>, keep list counter on ol element (parent),
       -                   support ordered number type only */
       -                fputs("* ", stdout);
       -        } else if (!strcasecmp(t, "hr")) { /* ruler */
       +                /* find first parent node and ordered numbers or unordered */
       +                if (parent) {
       +                        if (parent->tag.displaytype & DisplayListOrdered)
       +                                printf("%zu. ", parent->count);
       +                        else
       +                                fputs("* ", stdout);
       +                }
       +        }
       +
       +        /* specific tag handling */
       +        if (!strcasecmp(t, "hr")) { /* ruler */
                        for (i = 0; i < termwidth; i++)
                                putchar('-');
                }
 (DIR) diff --git a/xml.c b/xml.c
       t@@ -259,7 +259,7 @@ namedentitytostr(const char *e, char *buf, size_t bufsiz)
                        { "LT;",   '<'  },
                        { "GT;",   '>'  },
                        { "APOS;", '\'' },
       -                { "QUOT;", '"'  }
       +                { "QUOT;", '"'  },
                };
                size_t i;