parse classname better, hide u-hidden image links, but show direct image links - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit e22ef54ff11eaa0c478591c1577c9e68ad335c75
(DIR) parent b4bc9e6b47df5b9eb612f069c20463a924d6a55e
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Fri, 11 Aug 2017 16:15:48 +0200
parse classname better, hide u-hidden image links, but show direct image links
Diffstat:
M tscrape.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
---
(DIR) diff --git a/tscrape.c b/tscrape.c
@@ -153,6 +153,8 @@ html_entitytostr(const char *s, char *buf, size_t bufsiz)
static void
xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
{
+ classname[0] = '\0';
+
if (!strcmp(t, "p"))
state &= ~Text;
else if (!strcmp(t, "span"))
@@ -162,6 +164,12 @@ xmltagend(XMLParser *x, const char *t, size_t tl, int isshort)
}
static void
+xmltagstart(XMLParser *x, const char *t, size_t tl)
+{
+ classname[0] = '\0';
+}
+
+static void
xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
{
const char *v = classname;
@@ -193,7 +201,6 @@ xmltagstartparsed(XMLParser *x, const char *t, size_t tl, int isshort)
}
if ((state & Text) && !strcmp(t, "a") && !isspace(text[0]))
strlcat(text, " ", sizeof(text));
- classname[0] = '\0';
}
static void
@@ -206,6 +213,11 @@ xmlattr(XMLParser *x, const char *t, size_t tl, const char *a, size_t al,
/* UNIX timestamp */
strlcat(datatime, v, sizeof(datatime));
}
+
+ if ((state & Item) && !strcmp(a, "data-image-url")) {
+ strlcat(text, " ", sizeof(text));
+ strlcat(text, v, sizeof(text));
+ }
}
static void
@@ -234,7 +246,8 @@ xmldata(XMLParser *x, const char *d, size_t dl)
strlcat(fullname, " ", sizeof(fullname));
strlcat(fullname, d, sizeof(fullname));
} else if (state & Text) {
- strlcat(text, d, sizeof(text));
+ if (!isclassmatch(classname, STRP("u-hidden")))
+ strlcat(text, d, sizeof(text));
}
}
@@ -270,6 +283,7 @@ main(void)
p.xmlcdata = xmlcdata;
p.xmldata = xmldata;
p.xmldataentity = xmldataentity;
+ p.xmltagstart = xmltagstart;
p.xmltagend = xmltagend;
p.xmltagstartparsed = xmltagstartparsed;
/* reader (stdin) */