parse classname better, hide u-hidden image links, but show direct image links - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit e22ef54ff11eaa0c478591c1577c9e68ad335c75
 (DIR) parent b4bc9e6b47df5b9eb612f069c20463a924d6a55e
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Fri, 11 Aug 2017 16:15:48 +0200
       
       parse classname better, hide u-hidden image links, but show direct image links
       
       Diffstat:
         M tscrape.c                           |      18 ++++++++++++++++--
       
       1 file changed, 16 insertions(+), 2 deletions(-)
       ---
 (DIR) diff --git a/tscrape.c b/tscrape.c
       @@ -153,6 +153,8 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
        static void
        xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
        {
       +        classname[0] = '\0';
       +
                if (!strcmp(t, "p"))
                        state &= ~Text;
                else if (!strcmp(t, "span"))
       @@ -162,6 +164,12 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
        }
        
        static void
       +xmltagstart(XMLParser *x, const char *t, size_t tl)
       +{
       +        classname[0] = '\0';
       +}
       +
       +static void
        xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
        {
                const char *v = classname;
       @@ -193,7 +201,6 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
                }
                if ((state & Text) && !strcmp(t, "a") && !isspace(text[0]))
                        strlcat(text, " ", sizeof(text));
       -        classname[0] = '\0';
        }
        
        static void
       @@ -206,6 +213,11 @@ xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
                        /* UNIX timestamp */
                        strlcat(datatime, v, sizeof(datatime));
                }
       +
       +        if ((state & Item) && !strcmp(a, "data-image-url")) {
       +                strlcat(text, " ", sizeof(text));
       +                strlcat(text, v, sizeof(text));
       +        }
        }
        
        static void
       @@ -234,7 +246,8 @@ xmldata(XMLParser *x, const char *d, size_t dl)
                        strlcat(fullname, " ", sizeof(fullname));
                        strlcat(fullname, d, sizeof(fullname));
                } else if (state & Text) {
       -                strlcat(text, d, sizeof(text));
       +                if (!isclassmatch(classname, STRP("u-hidden")))
       +                        strlcat(text, d, sizeof(text));
                }
        }
        
       @@ -270,6 +283,7 @@ main(void)
                p.xmlcdata          = xmlcdata;
                p.xmldata           = xmldata;
                p.xmldataentity     = xmldataentity;
       +        p.xmltagstart       = xmltagstart;
                p.xmltagend         = xmltagend;
                p.xmltagstartparsed = xmltagstartparsed;
                /* reader (stdin) */