tscrape.c - tscrape - twitter scraper (not working anymore)
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
tscrape.c (11213B)
---
1 #include <sys/types.h>
2
3 #include <ctype.h>
4 #include <err.h>
5 #include <stdlib.h>
6 #include <stdio.h>
7 #include <string.h>
8 #include <strings.h>
9 #include <time.h>
10 #include <unistd.h>
11
12 #include "json.h"
13 #include "util.h"
14
/* Expands to TWO comma-separated arguments: the string s and its length
 * without the terminating NUL. Because the length comes from sizeof, s has to
 * be a string literal or a char array, not a pointer.
 * NOTE(review): not referenced in the code visible here. */
#define STRP(s) s,sizeof(s)-1
16
/* a tweet: one node of the singly linked list built by addtweet() and filled
 * in by processnodes() while the JSON is parsed */
struct tweet {
	char         fullname[1024];     /* "user.name" of the tweet */
	int          ispinned;           /* set to 1 in main() when itemid is a pinned id */
	char         itemusername[1024]; /* "retweeted_status.user.screen_name", empty if not set */
	char         itemfullname[1024]; /* "retweeted_status.user.name", empty if not set */
	char         full_text[4096];    /* "full_text"; the retweeted text takes precedence */
	char         username[1024];     /* "user.screen_name" of the tweet */
	time_t       timestamp;          /* parsed "created_at", -1 when not (yet) parsed */
	char         datatime[16];       /* NOTE(review): not referenced in the code visible here */
	char         itemid[64];         /* "id_str" of the tweet */
	char         retweetid[64];      /* "retweeted_status.id_str", empty if not set */

	struct tweet *next;              /* next tweet in the list or NULL */
};
32
/* a text substitution applied by printexpand(): one node of the singly linked
 * list built by addreplacement() */
struct replacement {
	char search[256];         /* text to look for */
	size_t search_len;        /* strlen(search), cached by addreplacement() */
	char replace[1024];       /* text printed instead of search */

	struct replacement *next; /* next replacement in the list or NULL */
};
40
/* tweets: head of the tweet list, tc: the tweet added last (the one being filled) */
static struct tweet *tweets, *tc;
/* reps: head of the replacement list, rc: the replacement added last */
static struct replacement *reps, *rc;
/* scratch buffers for the members of the urls/media entry currently being
 * parsed; cleared again once the entry has been turned into a replacement */
static char expanded_url[1024], media_url[1024], url[256];

/* ids collected from "user.pinned_tweet_ids"; npinned is the number stored */
#define MAX_PINNED 5
static char pinnedids[MAX_PINNED][64];
static size_t npinned;
48
/*
 * Convert a broken-down UTC date to seconds relative to the Unix epoch.
 *
 * year is counted from 1900, mon is zero-based (0 = January) and day is the
 * one-based day of the month. None of the fields are range checked; mon is
 * used as an array index, so the caller has to pass 0..11.
 */
long long
datetounix(long long year, int mon, int day, int hour, int min, int sec)
{
	/* days elapsed before the first of each month in a non-leap year */
	static const int days_before_month[] = {
		0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334
	};
	int leapyear = 0, leapdays = 0, ncycles, ncenturies = 0, left;
	long long secs;

	if (year >= 2 && year <= 138) {
		/* 1902..2038: every 4th year is a leap year, count from 1968 */
		leapdays = (year - 68) >> 2;
		if (((year - 68) & 3) == 0) {
			/* this year's own leap day is added per month below */
			leapdays--;
			leapyear = 1;
		}
		secs = 31536000 * (year - 70) + 86400 * leapdays;
	} else {
		/* general case: count from 2000 in 400 year cycles */
		ncycles = (year - 100) / 400;
		left = (year - 100) % 400;
		if (left < 0) {
			ncycles--;
			left += 400;
		}
		if (left == 0) {
			leapyear = 1;
		} else {
			ncenturies = left / 100;
			left %= 100;
			if (left != 0) {
				leapdays = left / 4;
				leapyear = (left % 4) == 0;
			}
		}
		leapdays += 97 * ncycles + 24 * ncenturies - leapyear;
		secs = (year - 100) * 31536000LL + leapdays * 86400LL +
		       946684800 + 86400;
	}

	secs += 86400LL * days_before_month[mon];
	if (leapyear && mon >= 2)
		secs += 86400; /* past February in a leap year */
	secs += 86400LL * (day - 1);
	secs += 3600LL * hour;
	secs += 60LL * min;
	secs += sec;

	return secs;
}
103
/* Parse a timestamp such as "Wed May 27 04:12:34 +0000 2020".
   Leading whitespace is skipped. The timezone field is read but ignored: the
   offset is assumed to be "+0000".
   Returns 0 on success and stores the UNIX time in *tp (when tp is not NULL),
   returns -1 when the text does not match or a field is out of range. */
static int
parsetime(const char *s, time_t *tp)
{
	static const char *const names[12] = {
		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
	};
	char dayname[4], monname[4], tz[6];
	int year, month, mday, hour, min, sec, nfields, valid;
	size_t k;

	while (*s && isspace((unsigned char)*s))
		s++;

	nfields = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
	                 dayname, monname, &mday, &hour, &min, &sec, tz, &year);
	if (nfields != 8)
		return -1;

	/* translate the month name to 1..12, 0 means unknown */
	month = 0;
	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++) {
		if (strcmp(names[k], monname) == 0) {
			month = (int)k + 1;
			break;
		}
	}
	if (month == 0)
		return -1;

	valid = year >= 0 && year <= 9999 &&
	        mday >= 1 && mday <= 31 &&
	        hour >= 0 && hour <= 23 &&
	        min >= 0 && min <= 59 &&
	        sec >= 0 && sec <= 59;
	if (!valid)
		return -1;

	if (tp != NULL)
		*tp = datetounix(year - 1900, month - 1, mday, hour, min, sec);

	return 0;
}
144
/* write s to stdout, leaving out every control character */
static void
printescape(const char *s)
{
	int c;

	while ((c = (unsigned char)*s++) != '\0') {
		if (iscntrl(c))
			continue;
		putchar(c);
	}
}
153
154 /* print text and expand urls */
155 static void
156 printexpand(const char *s)
157 {
158 struct replacement *r;
159
160 for (; *s; s++) {
161 if (isspace((unsigned char)*s)) {
162 putchar(' ');
163 continue;
164 } else if (iscntrl((unsigned char)*s)) {
165 continue;
166 }
167 for (r = reps; r; r = r->next) {
168 if (!strncmp(s, r->search, r->search_len)) {
169 s += r->search_len - 1;
170 printescape(r->replace);
171 break;
172 }
173 }
174 if (!r)
175 putchar(*s);
176 }
177 }
178
179 static void
180 printtweet(struct tweet *t)
181 {
182 if (t->timestamp != -1)
183 printf("%lld", (long long)t->timestamp);
184 putchar('\t');
185 printescape(t->username);
186 putchar('\t');
187 printescape(t->fullname);
188 putchar('\t');
189 printexpand(t->full_text);
190 putchar('\t');
191 printescape(t->itemid);
192 putchar('\t');
193 if (t->itemusername[0])
194 printescape(t->itemusername);
195 else
196 printescape(t->username);
197 putchar('\t');
198 if (t->itemfullname[0])
199 printescape(t->itemfullname);
200 else
201 printescape(t->fullname);
202 putchar('\t');
203 printescape(t->retweetid);
204 putchar('\t');
205 printf("%d", t->ispinned);
206 putchar('\n');
207 }
208
209 void
210 addpinned(const char *str)
211 {
212 if (npinned + 1 >= MAX_PINNED)
213 return;
214 strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
215 npinned++;
216 }
217
218 void
219 addtweet(void)
220 {
221 struct tweet *t;
222
223 if (!(t = calloc(1, sizeof(*t))))
224 err(1, "calloc");
225 t->timestamp = -1;
226 if (tweets)
227 tc = tc->next = t;
228 else
229 tweets = tc = t;
230 }
231
232 void
233 addreplacement(const char *search, const char *replace)
234 {
235 struct replacement *r;
236
237 for (r = reps; r; r = r->next) {
238 if (!strncmp(search, r->search, r->search_len))
239 return;
240 }
241
242 if (!(r = calloc(1, sizeof(*r))))
243 err(1, "calloc");
244 strlcpy(r->search, search, sizeof(r->search));
245 r->search_len = strlen(r->search);
246 strlcpy(r->replace, replace, sizeof(r->replace));
247
248 if (reps)
249 rc = rc->next = r;
250 else
251 reps = rc = r;
252 }
253
254 void
255 processnodes(struct json_node *nodes, size_t depth, const char *str)
256 {
257 if (depth == 2 &&
258 nodes[0].type == JSON_TYPE_ARRAY &&
259 nodes[1].type == JSON_TYPE_OBJECT) {
260 addtweet();
261 }
262
263 if (tc) {
264 if (depth == 3 &&
265 nodes[0].type == JSON_TYPE_ARRAY &&
266 nodes[1].type == JSON_TYPE_OBJECT &&
267 nodes[2].type == JSON_TYPE_STRING) {
268 if (!strcmp(nodes[2].name, "created_at")) {
269 parsetime(str, &tc->timestamp);
270 } else if (!strcmp(nodes[2].name, "id_str")) {
271 strlcpy(tc->itemid, str, sizeof(tc->itemid));
272 } else if (!strcmp(nodes[2].name, "full_text")) {
273 /* if set by retweet text don't override */
274 if (!tc->full_text[0])
275 strlcpy(tc->full_text, str, sizeof(tc->full_text));
276 }
277 }
278 if (depth == 4 &&
279 nodes[0].type == JSON_TYPE_ARRAY &&
280 nodes[1].type == JSON_TYPE_OBJECT &&
281 nodes[2].type == JSON_TYPE_OBJECT &&
282 !strcmp(nodes[2].name, "user")) {
283 if (nodes[3].type == JSON_TYPE_STRING) {
284 if (!strcmp(nodes[3].name, "name")) {
285 strlcpy(tc->fullname, str, sizeof(tc->fullname));
286 } else if (!strcmp(nodes[3].name, "screen_name")) {
287 strlcpy(tc->username, str, sizeof(tc->username));
288 }
289 }
290 }
291
292 if (depth == 4 &&
293 nodes[0].type == JSON_TYPE_ARRAY &&
294 nodes[1].type == JSON_TYPE_OBJECT &&
295 nodes[2].type == JSON_TYPE_OBJECT &&
296 nodes[3].type == JSON_TYPE_STRING &&
297 !strcmp(nodes[2].name, "retweeted_status")) {
298 if (!strcmp(nodes[3].name, "id_str")) {
299 strlcpy(tc->retweetid, str, sizeof(tc->retweetid));
300 } else if (!strcmp(nodes[3].name, "full_text")) {
301 strlcpy(tc->full_text, str, sizeof(tc->full_text));
302 }
303 }
304
305 if (depth == 5 &&
306 nodes[0].type == JSON_TYPE_ARRAY &&
307 nodes[1].type == JSON_TYPE_OBJECT &&
308 nodes[2].type == JSON_TYPE_OBJECT &&
309 nodes[3].type == JSON_TYPE_OBJECT &&
310 nodes[4].type == JSON_TYPE_STRING &&
311 !strcmp(nodes[2].name, "retweeted_status") &&
312 !strcmp(nodes[3].name, "user")) {
313 if (!strcmp(nodes[4].name, "name")) {
314 strlcpy(tc->itemfullname, str, sizeof(tc->itemfullname));
315 } else if (!strcmp(nodes[4].name, "screen_name")) {
316 strlcpy(tc->itemusername, str, sizeof(tc->itemusername));
317 }
318 }
319 }
320
321 if (depth == 5 &&
322 nodes[0].type == JSON_TYPE_ARRAY &&
323 nodes[1].type == JSON_TYPE_OBJECT &&
324 nodes[2].type == JSON_TYPE_OBJECT &&
325 !strcmp(nodes[2].name, "user")) {
326 if (nodes[3].type == JSON_TYPE_ARRAY &&
327 !strcmp(nodes[3].name, "pinned_tweet_ids")) {
328 if (nodes[4].type == JSON_TYPE_NUMBER) {
329 addpinned(str);
330 }
331 }
332 }
333
334 if (depth == 6 &&
335 nodes[0].type == JSON_TYPE_ARRAY &&
336 nodes[1].type == JSON_TYPE_OBJECT &&
337 nodes[2].type == JSON_TYPE_OBJECT &&
338 nodes[3].type == JSON_TYPE_ARRAY &&
339 nodes[4].type == JSON_TYPE_OBJECT &&
340 nodes[5].type == JSON_TYPE_STRING &&
341 !strcmp(nodes[2].name, "entities") &&
342 !strcmp(nodes[3].name, "urls")) {
343 if (!strcmp(nodes[5].name, "url")) {
344 strlcpy(url, str, sizeof(url));
345 } else if (!strcmp(nodes[5].name, "expanded_url")) {
346 /* assumes "expanded_url" is specified after "url" */
347 addreplacement(url, str);
348 url[0] = '\0';
349 }
350 }
351
352 /* [].extended_entities.media[].url */
353 if (depth == 6 &&
354 nodes[0].type == JSON_TYPE_ARRAY &&
355 nodes[1].type == JSON_TYPE_OBJECT &&
356 nodes[2].type == JSON_TYPE_OBJECT &&
357 nodes[3].type == JSON_TYPE_ARRAY &&
358 nodes[4].type == JSON_TYPE_OBJECT &&
359 nodes[5].type == JSON_TYPE_STRING &&
360 !strcmp(nodes[2].name, "extended_entities") &&
361 !strcmp(nodes[3].name, "media")) {
362 if (!strcmp(nodes[5].name, "media_url_https")) {
363 strlcpy(media_url, str, sizeof(media_url));
364 } else if (!strcmp(nodes[5].name, "url")) {
365 strlcpy(url, str, sizeof(url));
366 } else if (!strcmp(nodes[5].name, "expanded_url")) {
367 strlcpy(expanded_url, str, sizeof(expanded_url));
368 } else if (!strcmp(nodes[5].name, "type")) {
369 if (!strcmp(str, "photo")) {
370 addreplacement(url, media_url);
371 } else {
372 addreplacement(url, expanded_url);
373 }
374 media_url[0] = url[0] = expanded_url[0] = '\0';
375 }
376 }
377
378 if (depth == 7 &&
379 nodes[0].type == JSON_TYPE_ARRAY &&
380 nodes[1].type == JSON_TYPE_OBJECT &&
381 nodes[2].type == JSON_TYPE_OBJECT &&
382 nodes[3].type == JSON_TYPE_OBJECT &&
383 nodes[4].type == JSON_TYPE_ARRAY &&
384 nodes[5].type == JSON_TYPE_OBJECT &&
385 nodes[6].type == JSON_TYPE_STRING &&
386 !strcmp(nodes[2].name, "retweeted_status") &&
387 !strcmp(nodes[3].name, "entities") &&
388 !strcmp(nodes[4].name, "urls")) {
389 if (!strcmp(nodes[6].name, "url")) {
390 strlcpy(url, str, sizeof(url));
391 } else if (!strcmp(nodes[6].name, "expanded_url")) {
392 addreplacement(url, str);
393 url[0] = '\0';
394 }
395 }
396
397 /* [].retweeted_status.extended_entities.media[].url */
398 if (depth == 7 &&
399 nodes[0].type == JSON_TYPE_ARRAY &&
400 nodes[1].type == JSON_TYPE_OBJECT &&
401 nodes[2].type == JSON_TYPE_OBJECT &&
402 nodes[3].type == JSON_TYPE_OBJECT &&
403 nodes[4].type == JSON_TYPE_ARRAY &&
404 nodes[5].type == JSON_TYPE_OBJECT &&
405 nodes[6].type == JSON_TYPE_STRING &&
406 !strcmp(nodes[2].name, "retweeted_status") &&
407 !strcmp(nodes[3].name, "extended_entities") &&
408 !strcmp(nodes[4].name, "media")) {
409 if (!strcmp(nodes[6].name, "media_url_https")) {
410 strlcpy(media_url, str, sizeof(media_url));
411 } else if (!strcmp(nodes[6].name, "url")) {
412 strlcpy(url, str, sizeof(url));
413 } else if (!strcmp(nodes[6].name, "expanded_url")) {
414 strlcpy(expanded_url, str, sizeof(expanded_url));
415 } else if (!strcmp(nodes[6].name, "type")) {
416 if (!strcmp(str, "photo")) {
417 addreplacement(url, media_url);
418 } else {
419 addreplacement(url, expanded_url);
420 }
421 media_url[0] = url[0] = expanded_url[0] = '\0';
422 }
423 }
424 }
425
426 int
427 main(void)
428 {
429 struct tweet *t;
430 size_t i;
431
432 if (pledge("stdio", NULL) == -1)
433 err(1, "pledge");
434
435 if (parsejson(processnodes))
436 errx(2, "invalid JSON");
437
438 /* replace some HTML entities */
439 addreplacement("<", "<");
440 addreplacement(">", ">");
441 addreplacement("&", "&");
442
443 for (t = tweets; t; t = t->next) {
444 /* check for pinned tweets */
445 for (i = 0; i < npinned; i++) {
446 if (!strcmp(t->itemid, pinnedids[i])) {
447 t->ispinned = 1;
448 break;
449 }
450 }
451 printtweet(t);
452 }
453
454 return 0;
455 }