codemadness.org

       initial support to ignore literals in <script> and <style> - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 1ff56f1ce94cd62b0c16ee343917435c9048b8b8
 (DIR) parent 006a11c3aced38fa2cc3915793c1b9e886d0ad41
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri, 25 Aug 2017 17:44:37 +0200
       
       initial support to ignore literals in <script> and <style>
       
       Diffstat:
         M tscrape.c                           |      70 +++++++++++++++++++++++++++----
       
       1 file changed, 61 insertions(+), 9 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -38,9 +38,15 @@ static char      classname[256];
        static char      datatime[16];
        static char      itemid[64];
        static char      retweetid[64];
       -static int       state;
       +static int       isignore, state;
        static XMLParser p;
        
       +/* ignored tags: all text between the start and end tag is read literally and ignored */
       +static char *ignoretags[] = {
       +        "style",
       +        "script",
       +};
       +
        static void
        printtweet(void)
        {
       @@ -94,6 +100,9 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
        static void
        xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
        {
       +        if (isignore)
       +                return;
       +
                if (!strcmp(t, "p"))
                        state &= ~Text;
                else if (!strcmp(t, "span"))
       @@ -103,35 +112,78 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
        static void
        xmltagstart(XMLParser *x, const char *t, size_t tl)
        {
       +        int i;
       +
                classname[0] = '\0';
       +
       +        for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
       +                if (!strcasecmp(ignoretags[i], t)) {
       +                        isignore = 1;
       +                        break;
       +                }
       +        }
        }
        
        static void
        xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
        {
       -        const char *c = classname;
       +        char tmp[64];
       +        int c, i;
       +
       +        if (isignore) {
       +                /* HACK: the ignored tag has been parsed; hook into the reader and read
       +                   raw data until the literal end tag (without using the normal parser).
       +                   process (buffered) as xml[c]data (no entity) */
       +startignore:
       +                while ((c = x->getnext()) != EOF) {
       +                        if (c == '<')
       +                                break;
       +                }
       +                if (c == EOF)
       +                        return;
       +                if ((c = x->getnext()) != '/')
       +                        goto startignore;
       +                for (i = 0; (c = x->getnext()) != EOF; i++) {
       +                        if (c == '>')
       +                                break;
       +                        if (i + 1 >= sizeof(tmp))
       +                                goto startignore;
       +                        tmp[i] = c;
       +                }
       +                tmp[i] = '\0';
       +
       +                /* compare against current ignored tag */
       +                if (!strcasecmp(t, tmp))
       +                        isignore = 0;
       +                return;
       +        }
        
       -        if (!strcmp(t, "p") && isclassmatch(c, STRP("js-tweet-text"))) {
       +        if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text"))) {
                        if (state & (Item | Stream | Header))
                                state |= Text;
       -        } else if (!strcmp(t, "div") && isclassmatch(c, STRP("stream-item-footer"))) {
       +        } else if (!strcmp(t, "div") &&
       +                   isclassmatch(classname, STRP("stream-item-footer"))) {
                        if (text[0] && username[0])
                                printtweet();
                        state = 0;
       -        } else if (!strcmp(t, "li") && isclassmatch(c, STRP("js-stream-item"))) {
       +        } else if (!strcmp(t, "li") &&
       +                   isclassmatch(classname, STRP("js-stream-item"))) {
                        state |= Item;
                        datatime[0] = text[0] = timestamp[0] = itemfullname[0] = '\0';
                        itemid[0] = itemusername[0] = retweetid[0] = '\0';
                        ispinned = 0;
       -                if (isclassmatch(c, STRP("js-pinned")))
       +                if (isclassmatch(classname, STRP("js-pinned")))
                                ispinned = 1;
                } else if (state & Item) {
       -                if (!strcmp(t, "div") && isclassmatch(c, STRP("js-stream-tweet"))) {
       +                if (!strcmp(t, "div") &&
       +                    isclassmatch(classname, STRP("js-stream-tweet"))) {
                                state &= ~(Text|Header);
                                state |= Stream;
       -                } else if (!strcmp(t, "a") && isclassmatch(c, STRP("js-action-profile"))) {
       +                } else if (!strcmp(t, "a") &&
       +                           isclassmatch(classname, STRP("js-action-profile"))) {
                                state |= Header;
       -                } else if (!strcmp(t, "span") && isclassmatch(c, STRP("js-short-timestamp"))) {
       +                } else if (!strcmp(t, "span") &&
       +                          isclassmatch(classname, STRP("js-short-timestamp"))) {
                                state |= Timestamp;
                                strlcpy(timestamp, datatime, sizeof(timestamp));
                                datatime[0] = '\0';