z3bra.org

       testing improve white-space handling - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 2a56590cbe1c1739171a28d4c30b5b318cb0b364
 (DIR) parent e4a9e2404be2db1687430631e912f1809992a23b
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 21 Sep 2019 20:02:18 +0200
       
       testing improve white-space handling
       
       Diffstat:
         M README                              |       6 +++---
         M TODO                                |       7 +++++--
         M main.c                              |      53 +++++++++++++++++++++----------
       
       3 files changed, 45 insertions(+), 21 deletions(-)
       ---
 (DIR) diff --git a/README b/README
       @@ -1,15 +1,15 @@
        NOTE! work-in-progress (very slowly).
        
        
       -Text-based webpage viewer
       +Text-based HTML dump
        
        
        Goals / scope:
        
       -The tool will render a webpage only to stdout, similarly like links -dump or
       +The tool will only render HTML to stdout, similarly to links -dump or
        lynx -dump but simpler and more secure.
        
        - It will be usable and secure for rendering HTML mails.
        - No remote resources will be downloaded.
        - Data will be written to stdout only.
       -- No support for Javascript, CSS support, frames or forms.
       +- No support for Javascript, CSS support, frames or form input.
 (DIR) diff --git a/TODO b/TODO
       @@ -1,13 +1,16 @@
       +- improve/remove duplicate white-space/newlines?
       +- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre.
        - base href.
          specify and parse relative url, allow to specify base and also parse <base href="">
       -- handle whitespace, and tag types properly: atleast: inline-block, inline, block, pre.
        - detect <link /> to RSS/Atom feed, show as link.
          example: <link rel="alternate" href="atom.xml" type="application/atom+xml" title="Codemadness Atom Feed" />
          or
          <link rel="alternate" title="Tweakers Mixed RSS feed" type="application/rss+xml" href="https://tweakers.net/feeds/mixed.xml">
        - print safe (not certain control chars, except newline, TAB etc).
       -- improve/remove duplicate white-space/newlines?
       +- rework parsing of <script> and <style> with unescaped characters like < and >.
        - <code> should not be treated as a block (<pre> does?)
       +- make the code easy to embed/restructure to make a HTML-to-plain-text converter
       +  for HTML in RSS/Atom feeds.
        - add links as reference, for example on page: http://absmagazin.de/2018 the MP3 urls.
        - add COMPATOBJ for strlcpy and strlcat.
        - write a proper Makefile.
 (DIR) diff --git a/main.c b/main.c
       @@ -402,37 +402,49 @@ xmlcdata(XMLParser *p, const char *data, size_t datalen)
                printsafe(data);
        }
        
       +#if 0
       +static void
       +xmldatastart(XMLParser *p)
       +{
       +//        printf("DEBUG: %s\n", __func__);
       +}
       +#endif
       +
        static void
        xmldataend(XMLParser *p)
        {
                struct node *cur;
                char *start, *s, *e;
        
       +//        printf("DEBUG: %s\n", __func__);
       +
                if (!htmldata.data || !htmldata.len)
                        return;
        
                cur = &nodes[curnode];
       -        if (cur->displaytype & DisplayNone) {
       +
       +//        printf("DEBUG: node: %s, type: %d\n", cur->tag, cur->displaytype);
       +
       +        if (!cur->displaytype || (cur->displaytype & DisplayNone)) {
                        /* nothing */
                } else if (cur->displaytype & DisplayPre) {
                        fwrite(htmldata.data, 1, htmldata.len, stdout);
                } else {
                        start = htmldata.data;
       -                s = start;
       -                e = s + htmldata.len;
       -                /* TODO: better white-space handling */
       -                for (; s < e; s++) {
       -                        if (isspace((unsigned char)*s)) {
       -                                if (s != start && !isspace((unsigned char)s[-1]))
       +                e = htmldata.data + htmldata.len;
       +
       +                /* TODO: better white-space handling, for example if there is only
       +                   white-space between 2 block elements then it can be ignored. */
       +                for (s = start; s < e; s++) {
       +                        if (*s == '\r') {
       +                                continue;
       +                        } else if (isspace((unsigned char)*s)) {
       +                                if (s == start || !isspace((unsigned char)s[-1]))
                                                putchar(' ');
       -                        } else {
       -                                if (!iscntrl((unsigned char)*s))
       -                                        putchar(*s);
       +                        } else if (!iscntrl((unsigned char)*s)) {
       +                                putchar(*s);
                                }
                        }
       -                if (s != start && e != start && !isspace((unsigned char)s[-1]) &&
       -                    isspace((unsigned char)e[-1]))
       -                        putchar(' ');
                }
        
                string_clear(&htmldata);
       @@ -479,19 +491,25 @@ xmltagstart(XMLParser *x, const char *t, size_t tl)
                struct node *cur;
                int i;
        
       +//        printf("start of tag: %s\n", t);
       +
                if (curnode >= MAX_DEPTH - 2)
                        errx(1, "max tag depth reached: %d\n", curnode);
                curnode++;
        
                cur = &nodes[curnode];
                memset(cur, 0, sizeof(*cur));
       -        src[0] = '\0'; /* src, href */
       +        cur->displaytype = DisplayInline;
                strlcpy(cur->tag, t, sizeof(cur->tag));
        
       +        src[0] = '\0'; /* src, href */
       +
                /* set display type */
                for (i = 0; i < sizeof(tags) / sizeof(*tags); i++) {
                        if (!strcasecmp(tags[i].tag, t)) {
       -                        cur->displaytype |= tags[i].displaytype;
       +                        cur->displaytype = tags[i].displaytype;
       +//                        printf("match on tag: %s == %s, displaytype: %d\n",
       +//                               tags[i].tag, t, cur->displaytype);
                                break;
                        }
                }
       @@ -505,6 +523,8 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        
                cur = &nodes[curnode];
        
       +//        printf("DEBUG: end of tag: %s, %d, node tag: %s\n", t, cur->displaytype, cur->tag);
       +
                if (cur->displaytype & DisplayBlock) {
                        fputs("\n", stdout);
                } else if (cur->displaytype & DisplayPre) {
       @@ -609,7 +629,7 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
                                if (nodes[i].displaytype & DisplayListItem)
                                        continue;
                                if (nodes[i].displaytype & DisplayList)
       -                                fputs("    ", stdout);
       +                                fputs("  ", stdout);
                        }
                        /* TODO: for <ol>, keep list counter on ol element (parent),
                           support ordered number type only */
       @@ -656,6 +676,7 @@ main(void)
                parser.xmlattr = xmlattr;
                parser.xmlcdata = xmlcdata;
                parser.xmldata = xmldata;
       +//        parser.xmldatastart = xmldatastart;
                parser.xmldataend = xmldataend;
                parser.xmldataentity = xmldataentity;
                parser.xmltagstart = xmltagstart;