tremove preprocess code, compare tags and attribute case-insensitive - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 54f38abd3722c07e900820343e7c5288c6b0fdce
 (DIR) parent dacc8c21011cdd6f6c9dc4ebd177478b2151a2c1
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sun, 20 Aug 2017 20:56:39 +0200
       
       remove preprocess code, compare tags and attributes case-insensitively
       
       this includes an idea for ignoring literal tags (HTML).
       
       Diffstat:
         M main.c                              |     103 ++++++++++++-------------------
       
       1 file changed, 40 insertions(+), 63 deletions(-)
       ---
 (DIR) diff --git a/main.c b/main.c
       t@@ -19,7 +19,7 @@ static XMLParser parser;
        
        struct node {
                char tag[256];
       -/*        int isignore;*/
       +        int isignore;
                int ispre;
                int isinline;
                int isblock;
       t@@ -85,7 +85,6 @@ static char *blocktags[] = {
        };
        
        static String htmldata;
       -static String preprocess;
        
        /* Clear string only; don't free, prevents unnecessary reallocation. */
        static void
       t@@ -204,8 +203,8 @@ xmldata(XMLParser *p, const char *data, size_t datalen)
        
                cur = &nodes[curnode];
                string_append(&htmldata, data, datalen);
       -/*        if (cur->isignore)
       -                return;*/
       +        if (cur->isignore)
       +                return;
        }
        
        static void
       t@@ -233,28 +232,28 @@ xmltagstart(XMLParser *p, const char *tag, size_t taglen)
                src[0] = '\0'; /* src, href */
                strlcpy(cur->tag, tag, sizeof(cur->tag));
        
       -#if 0
       +#if 1
                for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
       -                if (!strcmp(ignoretags[i], tag)) {
       +                if (!strcasecmp(ignoretags[i], tag)) {
                                cur->isignore = 1;
                                break;
                        }
                }
        #endif
                for (i = 0; i < sizeof(pretags) / sizeof(*pretags); i++) {
       -                if (!strcmp(pretags[i], tag)) {
       +                if (!strcasecmp(pretags[i], tag)) {
                                cur->ispre = 1;
                                break;
                        }
                }
                for (i = 0; i < sizeof(blocktags) / sizeof(*blocktags); i++) {
       -                if (!strcmp(blocktags[i], tag)) {
       +                if (!strcasecmp(blocktags[i], tag)) {
                                cur->isblock = 1;
                                break;
                        }
                }
                for (i = 0; i < sizeof(inlinetags) / sizeof(*inlinetags); i++) {
       -                if (!strcmp(inlinetags[i], tag)) {
       +                if (!strcasecmp(inlinetags[i], tag)) {
                                cur->isinline = 1;
                                break;
                        }
       t@@ -270,8 +269,8 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
                if (curnode)
                        curnode--;
                cur = &nodes[curnode];
       -/*        if (cur->isignore)
       -                return;*/
       +        if (cur->isignore)
       +                return;
        
        #if 0
                if (src[0])
       t@@ -279,7 +278,7 @@ xmltagend(XMLParser *p, const char *tag, size_t taglen, int isshort)
                src[0] = '\0';
        #endif
        
       -        if (!strcmp(tag, "tr"))
       +        if (!strcasecmp(tag, "tr"))
                        fputs(" | ", stdout); /* HACK */
        
                if (cur->isblock)
       t@@ -300,31 +299,47 @@ static void
        xmltagstartparsed(XMLParser *p, const char *tag, size_t taglen, int isshort)
        {
                struct node *cur;
       -        int i;
       +        int c, i;
        
                cur = &nodes[curnode];
       -/*        if (cur->isignore)
       -                return;*/
       +        if (cur->isignore) {
       +#if 0
       +                /* HACK: ignored tag is parsed, hook into reader and read raw data
       +                   until literal end tag (without using the normal parser). */
       +                   
       +                /* TODO: process (buffered) as xml[c]data (no entity) */
       +                while ((c = getchar()) != EOF) {
       +                        if (c == '<') {
       +                                /* TODO: check /endtag */
       +                                break;
       +                        }
       +                }
       +                if (c == EOF) {
       +                }
       +
       +#endif
       +                return;
       +        }
        
                if (cur->isblock)
                        fputs("\n", stdout);
        
       -        if (!strcmp(tag, "td") || !strcmp(tag, "th"))
       +        if (!strcasecmp(tag, "td") || !strcasecmp(tag, "th"))
                        fputs(" | ", stdout); /* HACK */
        
       -        if (!strcmp(cur->tag, "li")) {
       +        if (!strcasecmp(cur->tag, "li")) {
                        /* indent nested list items */
                        for (i = curnode; i; i--) {
       -                        if (!strcmp(nodes[i].tag, "li"))
       +                        if (!strcasecmp(nodes[i].tag, "li"))
                                        continue;
       -                        if (!strcmp(nodes[i].tag, "ul") ||
       -                            !strcmp(nodes[i].tag, "ol"))
       +                        if (!strcasecmp(nodes[i].tag, "ul") ||
       +                            !strcasecmp(nodes[i].tag, "ol"))
                                        fputs("    ", stdout);
                        }
                        /* TODO: for <ol>, keep list counter on ol element (parent),
                           support ordered number type only */
                        fputs("* ", stdout);
       -        } else if (!strcmp(cur->tag, "hr")) {
       +        } else if (!strcasecmp(cur->tag, "hr")) {
                        for (i = 0; i < 36; i++)
                                putchar('-');
                }
       t@@ -338,65 +353,27 @@ static void
        xmlattr(XMLParser *p, const char *tag, size_t taglen, const char *name,
                size_t namelen, const char *value, size_t valuelen)
        {
       -        if (!strcmp(tag, "a") && !strcmp(name, "href") && valuelen)
       +        if (!strcasecmp(tag, "a") && !strcasecmp(name, "href") && valuelen)
                        strlcpy(src, value, sizeof(src));
        
       -        if ((!strcmp(tag, "img") || !strcmp(tag, "video") || !strcmp(tag, "audio")) &&
       -            !strcmp(name, "src") && valuelen)
       +        if ((!strcasecmp(tag, "img") || !strcasecmp(tag, "video") ||
       +             !strcasecmp(tag, "audio")) &&
       +            !strcasecmp(name, "src") && valuelen)
                        strlcpy(src, value, sizeof(src));
        }
        
       -/*static size_t readoffset;*/
       -
        int
        readchar(void)
        {
                return getchar();
       -#if 0
       -        size_t i, j;
       -        int c;
       -        
       -        for (; readoffset < preprocess.len; ) {
       -                if (preprocess.data[read_offset] != '<')
       -                        return preprocess.data[read_offset++];
       -                        
       -                for (j = 0; j < sizeof(ignoretags) / sizeof(*ignoretags); j++) {
       -                        if (!strncmp(&preprocess.data[i + 1], ignoretags[i], sizeof(ignoretags[i]) - 1)) {
       -                                if (strchr(" \t>", preprocess.data[i + 1 + sizeof(ignoretags[i]) - 1])) {
       -                                        /* TODO: search until end of this tag */
       -                                }
       -                        }
       -                }
       -                /* TODO: if no match just return char */
       -                return preprocess.data[read_offset++];
       -        }
       -        return EOF;
       -#endif
        }
        
       -/* TODO: preprocess data, strip <script>, <style> etc */
        int
        main(void)
        {
       -
       -        char buf[BUFSIZ];
       -        int n;
       -
                if (pledge("stdio", NULL) < 0)
                        err(1, "pledge");
        
       -#if 0
       -        /* TODO: optimize later */
       -        while (1) {
       -                /* TODO: check read error */
       -                n = read(0, buf, sizeof(buf) - 1);
       -                if (n <= 0)
       -                        break;
       -                buf[n] = '\0';
       -                string_append(&preprocess, buf, n);
       -        }
       -#endif
       -        
                parser.xmlattr = xmlattr;
                parser.xmlcdata = xmlcdata;
                parser.xmldata = xmldata;