timprove whitespace handling (needs more work) - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ea14e82082be78917aaa6e380879c0e230330b47
 (DIR) parent 421341e1a2b737cb269a144a1634511705161651
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 22 Jul 2017 23:49:01 +0200
       
       improve whitespace handling (needs more work)
       
       Diffstat:
         M Makefile                            |       2 +-
         M main.c                              |     108 ++++++++++++++++++++++---------
       
       2 files changed, 78 insertions(+), 32 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       t@@ -1,5 +1,5 @@
        build: clean
       -        cc xml.c main.c -o main
       +        cc -ggdb -O0 -Wall xml.c main.c -o main
        
        clean:
                rm -f main *.o
 (DIR) diff --git a/main.c b/main.c
       t@@ -10,10 +10,9 @@
        #include "xml.h"
        
        /* string and size */
       -#define STRP(s) s,sizeof(s)-1
       +/*#define STRP(s) s,sizeof(s)-1*/
        
        static XMLParser parser;
       -static int isdatastart;
        
        struct node {
                char tag[256];
       t@@ -23,6 +22,15 @@ struct node {
                int isblock;
        };
        
       +typedef struct node Node;
       +
       +/* String data / memory pool */
       +typedef struct string {
       +        char   *data;   /* data */
       +        size_t  len;    /* string length */
       +        size_t  bufsiz; /* allocated size */
       +} String;
       +
        static char src[4096]; /* src or href attribute */
        
        #define MAX_DEPTH 256
       t@@ -70,58 +78,96 @@ static char *blocktags[] = {
                "table",
        };
        
       +static String htmldata;
       +
       +/* Clear string only; don't free, prevents unnecessary reallocation. */
        static void
       -xmlcdata(XMLParser *p, const char *data, size_t datalen)
       +string_clear(String *s)
        {
       -        fputs(data, stdout);
       +        if (s->data)
       +                s->data[0] = '\0';
       +        s->len = 0;
        }
        
        static void
       -xmldatastart(XMLParser *p)
       +string_buffer_realloc(String *s, size_t newlen)
        {
       -        isdatastart = 1;
       +        size_t alloclen;
       +
       +        for (alloclen = 64; alloclen <= newlen; alloclen *= 2)
       +                ;
       +        if (!(s->data = realloc(s->data, alloclen)))
       +                err(1, "realloc");
       +        s->bufsiz = alloclen;
        }
        
        static void
       -xmldataend(XMLParser *p)
       +string_append(String *s, const char *data, size_t len)
        {
       -        isdatastart = 0;
       +        if (!len)
       +                return;
       +        /* check if allocation is necessary, don't shrink buffer,
       +         * should be more than bufsiz of course. */
       +        if (s->len + len >= s->bufsiz)
       +                string_buffer_realloc(s, s->len + len + 1);
       +        memcpy(s->data + s->len, data, len);
       +        s->len += len;
       +        s->data[s->len] = '\0';
        }
        
        static void
       -xmldata(XMLParser *p, const char *data, size_t datalen)
       +xmlcdata(XMLParser *p, const char *data, size_t datalen)
       +{
       +        fputs(data, stdout);
       +}
       +
       +static void
       +xmldataend(XMLParser *p)
        {
                struct node *cur;
       -        const char *s = data;
       +        char *start, *s, *e;
        
                cur = &nodes[curnode];
       -        if (cur->isignore)
       -                goto end;
        
       -        /* TODO: if not <pre> or w/e, skip? */
       -        if (isdatastart && isspace(*s)) {
       -                for (s++; *s; s++) {
       -                        if (!isspace(*s))
       -                                break;
       -                }
       -                putchar(' ');
       -        }
       +        start = htmldata.data;
       +        for (s = start; *s; s++)
       +                if (*s != '\r' && *s != '\n')
       +                        break;
       +
       +        e = s + strlen(s);
       +        for (; e > s; e--)
       +                if (*e != '\r' && *e != '\n')
       +                        break;
        
                if (cur->ispre) {
       -                for (; *s; s++)
       -                        putchar(*s);
       +                fwrite(s, 1, e - s, stdout);
                } else {
       -                for (; *s; s++) {
       -                        if (isspace(*s))
       -                                putchar(' ');
       -                        else
       +                for (; s < e; s++) {
       +                        if (!isspace(*s))
       +                                break;
       +                }
       +                for (; s < e; s++) {
       +                        if (!isspace(*s)) {
       +                                if (s != start && isspace(s[-1]))
       +                                        putchar(' ');
                                        putchar(*s);
       +                        }
                        }
                }
        
       -end:
       -        /* TODO: remove trailing space also ? */
       -        isdatastart = 0;
       +        string_clear(&htmldata);
       +}
       +
       +static void
       +xmldata(XMLParser *p, const char *data, size_t datalen)
       +{
       +        struct node *cur;
       +
       +        cur = &nodes[curnode];
       +        if (cur->isignore)
       +                return;
       +
       +        string_append(&htmldata, data, datalen);
        }
        
        static void
       t@@ -136,7 +182,7 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
                if (n <= 0)
                        xmldata(p, data, datalen);
                else
       -                fputs(buf, stdout);
       +                string_append(&htmldata, buf, n);
        }
        
        static void
       t@@ -254,6 +300,7 @@ xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
                        strlcpy(src, value, sizeof(src));
        }
        
       +/* TODO: preprocess data, strip <script>, <style> etc */
        int
        main(void)
        {
       t@@ -262,7 +309,6 @@ main(void)
        
                parser.xmlattr = xmlattr;
                parser.xmlcdata = xmlcdata;
       -        parser.xmldatastart = xmldatastart;
                parser.xmldata = xmldata;
                parser.xmldataend = xmldataend;
                parser.xmldataentity = xmldataentity;