refactor urls into general replacement function and replace some HTML entities - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit ff8d2ecaed4cb56e6cc1ccdc4a43e1a3e45eb61f
(DIR) parent 62a3853f6428208e5be727175479ebcede127497
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sun, 7 Jun 2020 00:08:42 +0200
refactor urls into general replacement function and replace some HTML entities
Diffstat:
M tscrape.c | 64 ++++++++++++++++---------------
1 file changed, 34 insertions(+), 30 deletions(-)
---
(DIR) diff --git a/tscrape.c b/tscrape.c
@@ -30,17 +30,16 @@ struct tweet {
struct tweet *next;
};
-/* url entities and their replacements */
-struct url {
- char url[256];
- size_t url_len;
- char expanded_url[1024];
+struct replacement {
+ char search[256];
+ size_t search_len;
+ char replace[1024];
- struct url *next;
+ struct replacement *next;
};
static struct tweet *tweets, *tc;
-static struct url *urls, *uc;
+static struct replacement *reps, *rc;
static char expanded_url[1024], media_url[1024], url[256];
#define MAX_PINNED 5
@@ -156,7 +155,7 @@ printescape(const char *s)
static void
printexpand(const char *s)
{
- struct url *u;
+ struct replacement *r;
for (; *s; s++) {
if (isspace((unsigned char)*s)) {
@@ -165,14 +164,14 @@ printexpand(const char *s)
} else if (iscntrl((unsigned char)*s)) {
continue;
}
- for (u = urls; u; u = u->next) {
- if (!strncmp(s, u->url, u->url_len)) {
- s += u->url_len - 1;
- printescape(u->expanded_url);
+ for (r = reps; r; r = r->next) {
+ if (!strncmp(s, r->search, r->search_len)) {
+ s += r->search_len - 1;
+ printescape(r->replace);
break;
}
}
- if (!u)
+ if (!r)
putchar(*s);
}
}
@@ -231,25 +230,25 @@ addtweet(void)
}
void
-addurl(const char *url, const char *expanded_url)
+addreplacement(const char *search, const char *replace)
{
- struct url *u;
+ struct replacement *r;
- for (u = urls; u; u = u->next) {
- if (!strncmp(url, u->url, u->url_len))
+ for (r = reps; r; r = r->next) {
+ if (!strncmp(search, r->search, r->search_len))
return;
}
- if (!(u = calloc(1, sizeof(*u))))
+ if (!(r = calloc(1, sizeof(*r))))
err(1, "calloc");
- strlcpy(u->url, url, sizeof(u->url));
- u->url_len = strlen(u->url);
- strlcpy(u->expanded_url, expanded_url, sizeof(u->expanded_url));
+ strlcpy(r->search, search, sizeof(r->search));
+ r->search_len = strlen(r->search);
+ strlcpy(r->replace, replace, sizeof(r->replace));
- if (urls)
- uc = uc->next = u;
+ if (reps)
+ rc = rc->next = r;
else
- urls = uc = u;
+ reps = rc = r;
}
void
@@ -353,7 +352,7 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
} else if (!strcmp(nodes[5].name, "expanded_url")) {
// printf("DEBUG: expanded_url: %s\n", str);
/* assumes "expanded_url" is specified after "url" */
- addurl(url, str);
+ addreplacement(url, str);
url[0] = '\0';
}
}
@@ -380,9 +379,9 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
} else if (!strcmp(nodes[5].name, "type")) {
// printf("DEBUG: type: %s\n", str);
if (!strcmp(str, "photo")) {
- addurl(url, media_url);
+ addreplacement(url, media_url);
} else {
- addurl(url, expanded_url);
+ addreplacement(url, expanded_url);
}
media_url[0] = url[0] = expanded_url[0] = '\0';
}
@@ -404,7 +403,7 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
strlcpy(url, str, sizeof(url));
} else if (!strcmp(nodes[6].name, "expanded_url")) {
// printf("DEBUG: expanded_url: %s\n", str);
- addurl(url, str);
+ addreplacement(url, str);
url[0] = '\0';
}
}
@@ -433,9 +432,9 @@ processnodes(struct json_node *nodes, size_t depth, const char *str)
} else if (!strcmp(nodes[6].name, "type")) {
// printf("DEBUG: type: %s\n", str);
if (!strcmp(str, "photo")) {
- addurl(url, media_url);
+ addreplacement(url, media_url);
} else {
- addurl(url, expanded_url);
+ addreplacement(url, expanded_url);
}
media_url[0] = url[0] = expanded_url[0] = '\0';
}
@@ -454,6 +453,11 @@ main(void)
if (parsejson(processnodes))
errx(2, "invalid JSON");
+ /* replace some HTML entities */
+ addreplacement("&lt;", "<");
+ addreplacement("&gt;", ">");
+ addreplacement("&amp;", "&");
+
for (t = tweets; t; t = t->next) {
/* check for pinned tweets */
for (i = 0; i < npinned; i++) {