improve and simplify ignore tag handling - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 7789dc04f4937dd68677a953320537b3da519f3b
 (DIR) parent e3bd0af8ac5af175c7dee7c24eadf238f5f4334f
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 26 Aug 2017 15:36:10 +0200
       
       improve and simplify ignore tag handling
       
       Diffstat:
         M tscrape.c                           |      53 +++++++++++++------------------
       
       1 file changed, 22 insertions(+), 31 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -100,41 +100,20 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
                        state &= ~(Timestamp);
        }
        
       +static char ignoretag[8];
       +static XMLParser xo; /* old context */
       +
        static void
       -xmltagstart(XMLParser *x, const char *t, size_t tl)
       +xmlignoretagend(XMLParser *p, const char *t, size_t tl, int isshort)
        {
       -        char tmp[64];
       -        int c, i;
       +        if (!strcasecmp(t, ignoretag))
       +                memcpy(p, &xo, sizeof(*p)); /* restore context */
       +}
        
       +static void
       +xmltagstart(XMLParser *x, const char *t, size_t tl)
       +{
                classname[0] = '\0';
       -
       -        /* HACK: ignored tag is parsed, hook into reader and read raw data
       -           until literal end tag (without using the normal parser).
       -           process (buffered) as xml[c]data (no entity) */
       -        if (strcasecmp(t, "script") && strcasecmp(t, "style"))
       -                return;
       -
       -startignore:
       -        while ((c = x->getnext()) != EOF) {
       -                if (c == '<')
       -                        break;
       -        }
       -        if (c == EOF)
       -                return;
       -        if ((c = x->getnext()) != '/')
       -                goto startignore;
       -        for (i = 0; (c = x->getnext()) != EOF; i++) {
       -                if (c == '>')
       -                        break;
       -                if (i + 1 >= sizeof(tmp))
       -                        goto startignore;
       -                tmp[i] = c;
       -        }
       -        tmp[i] = '\0';
       -
       -        /* compare against current ignored tag */
       -        if (strcasecmp(t, tmp))
       -                goto startignore;
        }
        
        static void
       @@ -142,6 +121,18 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
        {
                int i;
        
       +        /* temporarily replace the callbacks, keeping only the reader and
       +           an end-tag handler; the context is restored once that handler
       +           receives the end tag matching the ignored tag */
       +        if (!strcasecmp(t, "script") || !strcasecmp(t, "style")) {
       +                strlcpy(ignoretag, t, sizeof(ignoretag));
       +                memcpy(&xo, x, sizeof(xo)); /* store old context */
       +                memset(x, 0, sizeof(*x));
       +                x->xmltagend = xmlignoretagend;
       +                x->getnext = xo.getnext;
       +                return;
       +        }
       +
                if (!strcmp(t, "p") && isclassmatch(classname, STRP("js-tweet-text"))) {
                        if (state & (Item | Stream | Header))
                                state |= Text;