codemadness.org

       ignore incorrect unescaped HTML in <style> or <script> in a better way - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit ed3a979265abe557e783ea22c6a09fb96241ff95
 (DIR) parent 0fac9621c44b76c38d911438b1966d665e3b8134
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Mon, 17 Dec 2018 18:32:50 +0100
       
       ignore incorrect unescaped HTML in <style> or <script> in a better way
       
       Diffstat:
         M tscrape.c                           |      53 +++++++++++++++++++++----------
       
       1 file changed, 37 insertions(+), 16 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -40,6 +40,34 @@ static char      retweetid[64];
        static int       state;
        static XMLParser p;
        
       +static const char *ignorestate, *endtag;
       +static int (*getnext)(void);
       +
       +/* return a space for all data until some case-insensitive string occurs. This
       +   is used to parse incorrect HTML/XML that contains unescaped HTML in script
       +   or style tags. If you see some </script> tag in a CDATA or comment
       +   section then e-mail W3C and tell them the web is too complex. */
       +static inline int
       +getchar_ignore(void)
       +{
       +        int c;
       +
       +        if ((c = getnext()) == EOF)
       +                return EOF;
       +
       +        if (tolower(c) == tolower((unsigned char)*ignorestate)) {
       +                ignorestate++;
       +                if (*ignorestate == '\0') {
       +                        p.getnext = getnext; /* restore */
       +                        return c;
       +                }
       +        } else {
       +                ignorestate = endtag;
       +        }
       +
       +        return ' ';
       +}
       +
        static void
        printtweet(void)
        {
       @@ -100,16 +128,6 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
                        state &= ~(Timestamp);
        }
        
       -static char ignoretag[8];
       -static XMLParser xo; /* old context */
       -
       -static void
       -xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
       -{
       -        if (!strcasecmp(t, ignoretag))
       -                memcpy(p, &xo, sizeof(*p)); /* restore context */
       -}
       -
        static void
        xmltagstart(XMLParser *x, const char *t, size_t tl)
        {
       @@ -122,12 +140,15 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
                /* temporary replace the callback except the reader and end of tag
                   restore the context once we receive the same ignored tag in the
                   end tag handler */
       -        if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
       -                strlcpy(ignoretag, t, sizeof(ignoretag));
       -                memcpy(&xo, x, sizeof(xo)); /* store old context */
       -                memset(x, 0, sizeof(*x));
       -                x->xmltagend = xmlignoretagend;
       -                x->getnext = xo.getnext;
       +        if (!strcasecmp(t, "script")) {
       +                ignorestate = endtag = "</script>";
       +                getnext = x->getnext; /* for restore */
       +                x->getnext = getchar_ignore;
       +                return;
       +        } else if (!strcasecmp(t, "style")) {
       +                ignorestate = endtag = "</style>";
       +                getnext = x->getnext; /* for restore */
       +                x->getnext = getchar_ignore;
                        return;
                }