codemadness.org

       simplify ignore tags parsing - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit cb8ed18e7f5f31e68c9d5ab11a6daa8677af6636
 (DIR) parent 2dc167003132b6d9db8e779f26681c560c07a119
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 26 Aug 2017 12:43:15 +0200
       
       simplify ignore tags parsing
       
       Diffstat:
         M tscrape.c                           |      79 ++++++++++++-------------------
       
       1 file changed, 30 insertions(+), 49 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -38,15 +38,9 @@ static char      classname[256];
        static char      datatime[16];
        static char      itemid[64];
        static char      retweetid[64];
       -static int       isignore, state;
       +static int       state;
        static XMLParser p;
        
       -/* ignored tag, all text between this is interpreted literally and ignored */
       -static char *ignoretags[] = {
       -        "style",
       -        "script",
       -};
       -
        static void
        printtweet(void)
        {
       @@ -100,9 +94,6 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
        static void
        xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
        {
       -        if (isignore)
       -                return;
       -
                if (!strcmp(t, "p"))
                        state &= ~Text;
                else if (!strcmp(t, "span"))
       @@ -112,51 +103,44 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
        static void
        xmltagstart(XMLParser *x, const char *t, size_t tl)
        {
       -        int i;
       +        char tmp[64];
       +        int c, i;
        
                classname[0] = '\0';
        
       -        for (i = 0; i < sizeof(ignoretags) / sizeof(*ignoretags); i++) {
       -                if (!strcasecmp(ignoretags[i], t)) {
       -                        isignore = 1;
       +        /* HACK: ignored tag is parsed, hook into reader and read raw data
       +           until literal end tag (without using the normal parser).
       +           process (buffered) as xml[c]data (no entity) */
       +        if (strcasecmp(t, "script") && strcasecmp(t, "style"))
       +                return;
       +
       +startignore:
       +        while ((c = x->getnext()) != EOF) {
       +                if (c == '<')
                                break;
       -                }
                }
       +        if (c == EOF)
       +                return;
       +        if ((c = x->getnext()) != '/')
       +                goto startignore;
       +        for (i = 0; (c = x->getnext()) != EOF; i++) {
       +                if (c == '>')
       +                        break;
       +                if (i + 1 >= sizeof(tmp))
       +                        goto startignore;
       +                tmp[i] = c;
       +        }
       +        tmp[i] = '\0';
       +
       +        /* compare against current ignored tag */
       +        if (strcasecmp(t, tmp))
       +                goto startignore;
        }
        
        static void
        xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
        {
       -        char tmp[64];
       -        int c, i;
       -
       -        if (isignore) {
       -                /* HACK: ignored tag is parsed, hook into reader and read raw data
       -                   until literal end tag (without using the normal parser).
       -                   process (buffered) as xml[c]data (no entity) */
       -startignore:
       -                while ((c = x->getnext()) != EOF) {
       -                        if (c == '<')
       -                                break;
       -                }
       -                if (c == EOF)
       -                        return;
       -                if ((c = x->getnext()) != '/')
       -                        goto startignore;
       -                for (i = 0; (c = x->getnext()) != EOF; i++) {
       -                        if (c == '>')
       -                                break;
       -                        if (i + 1 >= sizeof(tmp))
       -                                goto startignore;
       -                        tmp[i] = c;
       -                }
       -                tmp[i] = '\0';
       -
       -                /* compare against current ignored tag */
       -                if (!strcasecmp(t, tmp))
       -                        isignore = 0;
       -                return;
       -        }
       +        int i;
        
                if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text"))) {
                        if (state & (Item | Stream | Header))
       @@ -197,9 +181,6 @@ static void
        xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
                const char *v, size_t vl)
        {
       -        if (isignore)
       -                return;
       -
                /* NOTE: assumes classname attribute is set before data-* in current tag */
                if (!state && !strcmp(t, "div") && isclassmatch(classname, STRP("user-actions"))) {
                        if (!strcmp(a, "data-screen-name")) {
       @@ -255,7 +236,7 @@ xmlattrentity(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
                char buf[16];
                ssize_t len;
        
       -        if (!state || isignore)
       +        if (!state)
                        return;
                if ((len = html_entitytostr(v, buf, sizeof(buf))) > 0)
                        xmlattr(x, t, tl, a, al, buf, (size_t)len);