ignore incorrect unescaped HTML in <style> or <script> in a better way - grabtitle - stupid HTML title grabber
 (HTM) git clone git://git.codemadness.org/grabtitle
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit d908478d0f84bc275428fd71e934c993bb29211c
 (DIR) parent 0cca681092b680c5b80da62771d47fa383be6cd1
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Mon, 10 Dec 2018 19:01:58 +0100
       
       ignore incorrect unescaped HTML in <style> or <script> in a better way
       
       this way we can still use a (mostly) XML parser for HTML data.
       
       Diffstat:
         M grabtitle.c                         |      71 +++++++++++++++++++------------
       
       1 file changed, 44 insertions(+), 27 deletions(-)
       ---
 (DIR) diff --git a/grabtitle.c b/grabtitle.c
       @@ -16,28 +16,38 @@
        #endif
        
        static XMLParser parser;
       -static int istitle, ignore;
       -
       -static void
       -xmltagstart(XMLParser *p, const char *t, size_t tl)
       +static const char *state, *endtag;
       +static int (*getnext)(void);
       +
       +/* return a space for all data until some case-insensitive string occurs. This
       +   is used to parse incorrect HTML/XML that contains unescaped HTML in script
       +   or style tags. */
       +static inline int
       +getchar_ignore(void)
        {
       -        if ((tl == 6 && !strcasecmp(t, "script")) ||
       -            (tl == 5 && !strcasecmp(t, "style")))
       -                ignore = 1;
       -        if (!ignore && tl == 5 && !strcasecmp(t, "title"))
       -                istitle = 1;
       +        int c;
       +
       +        if ((c = getnext()) == EOF)
       +                return EOF;
       +
       +        if (tolower(c) == tolower((unsigned char)*state)) {
       +                state++;
       +                if (*state == '\0') {
       +                        parser.getnext = getnext; /* restore */
       +                        return c;
       +                }
       +        } else {
       +                state = endtag;
       +        }
       +
       +        return ' ';
        }
        
        static void
        xmltagend(XMLParser *p, const char *t, size_t tl, int isshort)
        {
       -        if (ignore && ((tl == 6 && !strcasecmp(t, "script")) ||
       -            (tl == 5 && !strcasecmp(t, "style"))))
       -                ignore = 0;
       -        if (istitle && tl == 5 && !strcasecmp(t, "title")) {
       -                putchar('\n');
       -                exit(0);
       -        }
       +        putchar('\n');
       +        exit(0);
        }
        
        /* data and CDATA */
       @@ -46,9 +56,6 @@ xmldata(XMLParser *p, const char *d, size_t dl)
        {
                size_t i;
        
       -        if (!istitle)
       -                return;
       -
                for (i = 0; *d && i < dl; i++, d++) {
                        if (iscntrl((unsigned char)*d))
                                putchar(' ');
       @@ -63,15 +70,30 @@ xmldataentity(XMLParser *p, const char *d, size_t dl)
                char buf[16];
                ssize_t len;
        
       -        if (!istitle)
       -                return;
       -
                if ((len = xml_entitytostr(d, buf, sizeof(buf))))
                        xmldata(p, buf, (size_t)len);
                else
                        xmldata(p, d, dl);
        }
        
       +static void
       +xmltagstart(XMLParser *p, const char *t, size_t tl)
       +{
       +        if (tl == 6 && !strcasecmp(t, "script")) {
       +                state = endtag = "</script>";
       +                getnext = p->getnext; /* for restore */
       +                p->getnext = getchar_ignore;
       +        } else if (tl == 5 && !strcasecmp(t, "style")) {
       +                state = endtag = "</style>";
       +                getnext = p->getnext; /* for restore */
       +                p->getnext = getchar_ignore;
       +        } else if (tl == 5 && !strcasecmp(t, "title")) {
       +                p->xmltagend = xmltagend;
       +                p->xmlcdata = p->xmldata = xmldata;
       +                p->xmldataentity = xmldataentity;
       +        }
       +}
       +
        int
        main(int argc, char *argv[])
        {
       @@ -81,11 +103,6 @@ main(int argc, char *argv[])
                }
        
                parser.xmltagstart = xmltagstart;
       -        parser.xmltagend = xmltagend;
       -        parser.xmldata = xmldata;
       -        parser.xmlcdata = xmldata;
       -        parser.xmldataentity = xmldataentity;
       -
                parser.getnext = getchar;
                xml_parse(&parser);