tignore all within <script> or <style> (WIP) - webdump - [FORK] git://git.codemadness.org/webdump
 (HTM) git clone git://git.z3bra.org/webdump.git
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ec68d5635764887d323bc7e3e09c01fda411e865
 (DIR) parent 26361ccd0ab0f19276d7727b8f589b1109cfbfd1
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Thu, 27 Jun 2019 19:37:07 +0200
       
       ignore all within <script> or <style> (WIP)
       
       Diffstat:
         M main.c                              |      59 +++++++++++++++++++++----------
       
       1 file changed, 40 insertions(+), 19 deletions(-)
       ---
 (DIR) diff --git a/main.c b/main.c
       t@@ -89,6 +89,34 @@ static char *blocktags[] = {
        
        static String htmldata;
        
       +static const char *ignorestate, *endtag;
       +static int (*getnext)(void);
       +
       +/* return a space for all data until some case-insensitive string occurs. This
       +   is used to parse incorrect HTML/XML that contains unescaped HTML in script
       +   or style tags. If you see some </script> tag in a CDATA or comment
       +   section then e-mail W3C and tell them the web is too complex. */
       +static inline int
       +getnext_ignore(void)
       +{
       +        int c;
       +
       +        if ((c = getnext()) == EOF)
       +                return EOF;
       +
       +        if (tolower(c) == tolower((unsigned char)*ignorestate)) {
       +                ignorestate++;
       +                if (*ignorestate == '\0') {
       +                        parser.getnext = getnext; /* restore */
       +                        return c;
       +                }
       +        } else {
       +                ignorestate = endtag;
       +        }
       +
       +        return ' ';
       +}
       +
        /* Clear string only; don't free, prevents unnecessary reallocation. */
        static void
        string_clear(String *s)
       t@@ -218,10 +246,10 @@ xmldataentity(XMLParser *p, const char *data, size_t datalen)
                /* TODO: add to tscrape too */
                /* TODO: support some more HTML entities */
                n = xml_entitytostr(data, buf, sizeof(buf));
       -        if (n <= 0)
       -                xmldata(p, data, datalen);
       +        if (n > 0)
       +                xmldata(p, buf, (size_t)n);
                else
       -                string_append(&htmldata, buf, n);
       +                xmldata(p, data, datalen);
        }
        
        static void
       t@@ -282,16 +310,6 @@ xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
                }
        }
        
       -static char ignoretag[8];
       -static XMLParser xo; /* old context */
       -
       -static void
       -xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
       -{
       -        if (!strcasecmp(t, ignoretag))
       -                memcpy(p, &xo, sizeof(*p)); /* restore context */
       -}
       -
        static void
        xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
        {
       t@@ -301,12 +319,15 @@ xmltagstartparsed(XMLParser *p, const char *t, size_t tl, int isshort)
                /* temporarily replace the callbacks, except the reader and end of tag
                   handler; restore the context once we receive the same ignored tag in
                   the end tag handler */
       -        if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
       -                strlcpy(ignoretag, t, sizeof(ignoretag));
       -                memcpy(&xo, p, sizeof(xo)); /* store old context */
       -                memset(p, 0, sizeof(*p));
       -                p->xmltagend = xmlignoretagend;
       -                p->getnext = xo.getnext;
       +        if (!strcasecmp(t, "script")) {
       +                ignorestate = endtag = "</script>";
       +                getnext = p->getnext; /* for restore */
       +                p->getnext = getnext_ignore;
       +                return;
       +        } else if (!strcasecmp(t, "style")) {
       +                ignorestate = endtag = "</style>";
       +                getnext = p->getnext; /* for restore */
       +                p->getnext = getnext_ignore;
                        return;
                }