tscrape.c - tscrape - twitter scraper (not working anymore)
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       tscrape.c (11213B)
       ---
            1 #include <sys/types.h>
            2 
            3 #include <ctype.h>
            4 #include <err.h>
            5 #include <stdlib.h>
            6 #include <stdio.h>
            7 #include <string.h>
            8 #include <strings.h>
            9 #include <time.h>
           10 #include <unistd.h>
           11 
           12 #include "json.h"
           13 #include "util.h"
           14 
/* Expands to two comma-separated arguments: the string `s` itself and its
   length without the terminating NUL. The length comes from sizeof, so `s`
   must be a string literal or a char array, not a pointer. */
#define STRP(s) s,sizeof(s)-1
           16 
/* a tweet: one node of the singly linked list built by addtweet() and
   filled in field by field by processnodes() */
struct tweet {
        char fullname[1024];     /* [].user.name */
        int  ispinned;           /* set to 1 in main() if itemid is a pinned id */
        char itemusername[1024]; /* [].retweeted_status.user.screen_name */
        char itemfullname[1024]; /* [].retweeted_status.user.name */
        char full_text[4096];    /* [].full_text or [].retweeted_status.full_text */
        char username[1024];     /* [].user.screen_name */
        time_t timestamp;        /* parsed from [].created_at, -1 if not set */
        char datatime[16];       /* NOTE(review): not referenced in this file */
        char itemid[64];         /* [].id_str */
        char retweetid[64];      /* [].retweeted_status.id_str */

        struct tweet *next;      /* next tweet in the list or NULL */
};
           32 
/* a text substitution applied by printexpand(): occurrences of `search` in
   the tweet text are printed as `replace`; nodes form a singly linked list */
struct replacement {
        char search[256];         /* text to look for */
        size_t search_len;        /* strlen(search), cached by addreplacement() */
        char replace[1024];       /* text printed instead of `search` */

        struct replacement *next; /* next replacement in the list or NULL */
};
           40 
/* list of tweets: `tweets` is the head, `tc` the most recently added node */
static struct tweet *tweets, *tc;
/* list of replacements: `reps` is the head, `rc` the most recently added node */
static struct replacement *reps, *rc;
/* scratch buffers holding the fields of the URL/media entity currently being
   parsed by processnodes() until they are passed to addreplacement() */
static char expanded_url[1024], media_url[1024], url[256];

/* ids collected from "pinned_tweet_ids"; `npinned` is the number of used slots */
#define MAX_PINNED 5
static char pinnedids[MAX_PINNED][64];
static size_t npinned;
           48 
           49 long long
           50 datetounix(long long year, int mon, int day, int hour, int min, int sec)
           51 {
           52         static const int secs_through_month[] = {
           53                 0, 31 * 86400, 59 * 86400, 90 * 86400,
           54                 120 * 86400, 151 * 86400, 181 * 86400, 212 * 86400,
           55                 243 * 86400, 273 * 86400, 304 * 86400, 334 * 86400 };
           56         int is_leap = 0, cycles, centuries = 0, leaps = 0, rem;
           57         long long t;
           58 
           59         if (year - 2ULL <= 136) {
           60                 leaps = (year - 68) >> 2;
           61                 if (!((year - 68) & 3)) {
           62                         leaps--;
           63                         is_leap = 1;
           64                 } else {
           65                         is_leap = 0;
           66                 }
           67                 t = 31536000 * (year - 70) + 86400 * leaps;
           68         } else {
           69                 cycles = (year - 100) / 400;
           70                 rem = (year - 100) % 400;
           71                 if (rem < 0) {
           72                         cycles--;
           73                         rem += 400;
           74                 }
           75                 if (!rem) {
           76                         is_leap = 1;
           77                 } else {
           78                         if (rem >= 300)
           79                                 centuries = 3, rem -= 300;
           80                         else if (rem >= 200)
           81                                 centuries = 2, rem -= 200;
           82                         else if (rem >= 100)
           83                                 centuries = 1, rem -= 100;
           84                         if (rem) {
           85                                 leaps = rem / 4U;
           86                                 rem %= 4U;
           87                                 is_leap = !rem;
           88                         }
           89                 }
           90                 leaps += 97 * cycles + 24 * centuries - is_leap;
           91                 t = (year - 100) * 31536000LL + leaps * 86400LL + 946684800 + 86400;
           92         }
           93         t += secs_through_month[mon];
           94         if (is_leap && mon >= 2)
           95                 t += 86400;
           96         t += 86400LL * (day - 1);
           97         t += 3600LL * hour;
           98         t += 60LL * min;
           99         t += sec;
          100 
          101         return t;
          102 }
          103 
          104 /* parse time format: "Wed May 27 04:12:34 +0000 2020"
          105    assumes tz offset is "+0000" */
          106 static int
          107 parsetime(const char *s, time_t *tp)
          108 {
          109         static char *mons[] = {
          110                 "Jan", "Feb", "Mar", "Apr", "May", "Jun",
          111                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
          112         };
          113         int year, mon = 0, mday, hour, min, sec, i;
          114         char tzbuf[6], monbuf[4], wdaybuf[4];
          115 
          116         for (; *s && isspace((unsigned char)*s); s++)
          117                 ;
          118         i = sscanf(s, "%3s %3s %02d %02d:%02d:%02d %5s %4d",
          119                    wdaybuf, monbuf, &mday, &hour, &min, &sec, tzbuf, &year);
          120         if (i != 8)
          121                 return -1;
          122         for (i = 0; i < sizeof(mons) / sizeof(mons[0]); i++) {
          123                 if (!strcmp(mons[i], monbuf)) {
          124                         mon = i + 1;
          125                         break;
          126                 }
          127         }
          128         if (mon == 0)
          129                 return -1;
          130 
          131         /* invalid range */
          132         if (year < 0 || year > 9999 ||
          133             mon < 1 || mon > 12 ||
          134             mday < 1 || mday > 31 ||
          135             hour < 0 || hour > 23 ||
          136             min < 0 || min> 59 ||
          137             sec < 0 || sec > 59)
          138                 return -1;
          139 
          140         if (tp)
          141                 *tp = datetounix(year - 1900, mon - 1, mday, hour, min, sec);
          142         return 0;
          143 }
          144 
          145 static void
          146 printescape(const char *s)
          147 {
          148         for (; *s; s++) {
          149                 if (!iscntrl((unsigned char)*s))
          150                         putchar(*s);
          151         }
          152 }
          153 
          154 /* print text and expand urls */
          155 static void
          156 printexpand(const char *s)
          157 {
          158         struct replacement *r;
          159 
          160         for (; *s; s++) {
          161                 if (isspace((unsigned char)*s)) {
          162                         putchar(' ');
          163                         continue;
          164                 } else if (iscntrl((unsigned char)*s)) {
          165                         continue;
          166                 }
          167                 for (r = reps; r; r = r->next) {
          168                         if (!strncmp(s, r->search, r->search_len)) {
          169                                 s += r->search_len - 1;
          170                                 printescape(r->replace);
          171                                 break;
          172                         }
          173                 }
          174                 if (!r)
          175                         putchar(*s);
          176         }
          177 }
          178 
          179 static void
          180 printtweet(struct tweet *t)
          181 {
          182         if (t->timestamp != -1)
          183                 printf("%lld", (long long)t->timestamp);
          184         putchar('\t');
          185         printescape(t->username);
          186         putchar('\t');
          187         printescape(t->fullname);
          188         putchar('\t');
          189         printexpand(t->full_text);
          190         putchar('\t');
          191         printescape(t->itemid);
          192         putchar('\t');
          193         if (t->itemusername[0])
          194                 printescape(t->itemusername);
          195         else
          196                 printescape(t->username);
          197         putchar('\t');
          198         if (t->itemfullname[0])
          199                 printescape(t->itemfullname);
          200         else
          201                 printescape(t->fullname);
          202         putchar('\t');
          203         printescape(t->retweetid);
          204         putchar('\t');
          205         printf("%d", t->ispinned);
          206         putchar('\n');
          207 }
          208 
          209 void
          210 addpinned(const char *str)
          211 {
          212         if (npinned + 1 >= MAX_PINNED)
          213                 return;
          214         strlcpy(pinnedids[npinned], str, sizeof(pinnedids[0]));
          215         npinned++;
          216 }
          217 
          218 void
          219 addtweet(void)
          220 {
          221         struct tweet *t;
          222 
          223         if (!(t = calloc(1, sizeof(*t))))
          224                 err(1, "calloc");
          225         t->timestamp = -1;
          226         if (tweets)
          227                 tc = tc->next = t;
          228         else
          229                 tweets = tc = t;
          230 }
          231 
          232 void
          233 addreplacement(const char *search, const char *replace)
          234 {
          235         struct replacement *r;
          236 
          237         for (r = reps; r; r = r->next) {
          238                 if (!strncmp(search, r->search, r->search_len))
          239                         return;
          240         }
          241 
          242         if (!(r = calloc(1, sizeof(*r))))
          243                 err(1, "calloc");
          244         strlcpy(r->search, search, sizeof(r->search));
          245         r->search_len = strlen(r->search);
          246         strlcpy(r->replace, replace, sizeof(r->replace));
          247 
          248         if (reps)
          249                 rc = rc->next = r;
          250         else
          251                 reps = rc = r;
          252 }
          253 
          254 void
          255 processnodes(struct json_node *nodes, size_t depth, const char *str)
          256 {
          257         if (depth == 2 &&
          258             nodes[0].type == JSON_TYPE_ARRAY &&
          259             nodes[1].type == JSON_TYPE_OBJECT) {
          260                 addtweet();
          261         }
          262 
          263         if (tc) {
          264                 if (depth == 3 &&
          265                     nodes[0].type == JSON_TYPE_ARRAY &&
          266                     nodes[1].type == JSON_TYPE_OBJECT &&
          267                     nodes[2].type == JSON_TYPE_STRING) {
          268                         if (!strcmp(nodes[2].name, "created_at")) {
          269                                 parsetime(str, &tc->timestamp);
          270                         } else if (!strcmp(nodes[2].name, "id_str")) {
          271                                 strlcpy(tc->itemid, str, sizeof(tc->itemid));
          272                         } else if (!strcmp(nodes[2].name, "full_text")) {
          273                                 /* if set by retweet text don't override */
          274                                 if (!tc->full_text[0])
          275                                         strlcpy(tc->full_text, str, sizeof(tc->full_text));
          276                         }
          277                 }
          278                 if (depth == 4 &&
          279                     nodes[0].type == JSON_TYPE_ARRAY &&
          280                     nodes[1].type == JSON_TYPE_OBJECT &&
          281                     nodes[2].type == JSON_TYPE_OBJECT &&
          282                     !strcmp(nodes[2].name, "user")) {
          283                         if (nodes[3].type == JSON_TYPE_STRING) {
          284                                 if (!strcmp(nodes[3].name, "name")) {
          285                                         strlcpy(tc->fullname, str, sizeof(tc->fullname));
          286                                 } else if (!strcmp(nodes[3].name, "screen_name")) {
          287                                         strlcpy(tc->username, str, sizeof(tc->username));
          288                                 }
          289                         }
          290                 }
          291 
          292                 if (depth == 4 &&
          293                     nodes[0].type == JSON_TYPE_ARRAY &&
          294                     nodes[1].type == JSON_TYPE_OBJECT &&
          295                     nodes[2].type == JSON_TYPE_OBJECT &&
          296                     nodes[3].type == JSON_TYPE_STRING &&
          297                     !strcmp(nodes[2].name, "retweeted_status")) {
          298                         if (!strcmp(nodes[3].name, "id_str")) {
          299                                 strlcpy(tc->retweetid, str, sizeof(tc->retweetid));
          300                         } else if (!strcmp(nodes[3].name, "full_text")) {
          301                                 strlcpy(tc->full_text, str, sizeof(tc->full_text));
          302                         }
          303                 }
          304 
          305                 if (depth == 5 &&
          306                     nodes[0].type == JSON_TYPE_ARRAY &&
          307                     nodes[1].type == JSON_TYPE_OBJECT &&
          308                     nodes[2].type == JSON_TYPE_OBJECT &&
          309                     nodes[3].type == JSON_TYPE_OBJECT &&
          310                     nodes[4].type == JSON_TYPE_STRING &&
          311                     !strcmp(nodes[2].name, "retweeted_status") &&
          312                     !strcmp(nodes[3].name, "user")) {
          313                         if (!strcmp(nodes[4].name, "name")) {
          314                                 strlcpy(tc->itemfullname, str, sizeof(tc->itemfullname));
          315                         } else if (!strcmp(nodes[4].name, "screen_name")) {
          316                                 strlcpy(tc->itemusername, str, sizeof(tc->itemusername));
          317                         }
          318                 }
          319         }
          320 
          321         if (depth == 5 &&
          322             nodes[0].type == JSON_TYPE_ARRAY &&
          323             nodes[1].type == JSON_TYPE_OBJECT &&
          324             nodes[2].type == JSON_TYPE_OBJECT &&
          325             !strcmp(nodes[2].name, "user")) {
          326                 if (nodes[3].type == JSON_TYPE_ARRAY &&
          327                     !strcmp(nodes[3].name, "pinned_tweet_ids")) {
          328                         if (nodes[4].type == JSON_TYPE_NUMBER) {
          329                                 addpinned(str);
          330                         }
          331                 }
          332         }
          333 
          334         if (depth == 6 &&
          335             nodes[0].type == JSON_TYPE_ARRAY &&
          336             nodes[1].type == JSON_TYPE_OBJECT &&
          337             nodes[2].type == JSON_TYPE_OBJECT &&
          338             nodes[3].type == JSON_TYPE_ARRAY &&
          339             nodes[4].type == JSON_TYPE_OBJECT &&
          340             nodes[5].type == JSON_TYPE_STRING &&
          341             !strcmp(nodes[2].name, "entities") &&
          342             !strcmp(nodes[3].name, "urls")) {
          343                 if (!strcmp(nodes[5].name, "url")) {
          344                         strlcpy(url, str, sizeof(url));
          345                 } else if (!strcmp(nodes[5].name, "expanded_url")) {
          346                         /* assumes "expanded_url" is specified after "url" */
          347                         addreplacement(url, str);
          348                         url[0] = '\0';
          349                 }
          350         }
          351 
          352         /* [].extended_entities.media[].url */
          353         if (depth == 6 &&
          354             nodes[0].type == JSON_TYPE_ARRAY &&
          355             nodes[1].type == JSON_TYPE_OBJECT &&
          356             nodes[2].type == JSON_TYPE_OBJECT &&
          357             nodes[3].type == JSON_TYPE_ARRAY &&
          358             nodes[4].type == JSON_TYPE_OBJECT &&
          359             nodes[5].type == JSON_TYPE_STRING &&
          360             !strcmp(nodes[2].name, "extended_entities") &&
          361             !strcmp(nodes[3].name, "media")) {
          362                 if (!strcmp(nodes[5].name, "media_url_https")) {
          363                         strlcpy(media_url, str, sizeof(media_url));
          364                 } else if (!strcmp(nodes[5].name, "url")) {
          365                         strlcpy(url, str, sizeof(url));
          366                 } else if (!strcmp(nodes[5].name, "expanded_url")) {
          367                         strlcpy(expanded_url, str, sizeof(expanded_url));
          368                 } else if (!strcmp(nodes[5].name, "type")) {
          369                         if (!strcmp(str, "photo")) {
          370                                 addreplacement(url, media_url);
          371                         } else {
          372                                 addreplacement(url, expanded_url);
          373                         }
          374                         media_url[0] = url[0] = expanded_url[0] = '\0';
          375                 }
          376         }
          377 
          378         if (depth == 7 &&
          379             nodes[0].type == JSON_TYPE_ARRAY &&
          380             nodes[1].type == JSON_TYPE_OBJECT &&
          381             nodes[2].type == JSON_TYPE_OBJECT &&
          382             nodes[3].type == JSON_TYPE_OBJECT &&
          383             nodes[4].type == JSON_TYPE_ARRAY &&
          384             nodes[5].type == JSON_TYPE_OBJECT &&
          385             nodes[6].type == JSON_TYPE_STRING &&
          386             !strcmp(nodes[2].name, "retweeted_status") &&
          387             !strcmp(nodes[3].name, "entities") &&
          388             !strcmp(nodes[4].name, "urls")) {
          389                 if (!strcmp(nodes[6].name, "url")) {
          390                         strlcpy(url, str, sizeof(url));
          391                 } else if (!strcmp(nodes[6].name, "expanded_url")) {
          392                         addreplacement(url, str);
          393                         url[0] = '\0';
          394                 }
          395         }
          396 
          397         /* [].retweeted_status.extended_entities.media[].url */
          398         if (depth == 7 &&
          399             nodes[0].type == JSON_TYPE_ARRAY &&
          400             nodes[1].type == JSON_TYPE_OBJECT &&
          401             nodes[2].type == JSON_TYPE_OBJECT &&
          402             nodes[3].type == JSON_TYPE_OBJECT &&
          403             nodes[4].type == JSON_TYPE_ARRAY &&
          404             nodes[5].type == JSON_TYPE_OBJECT &&
          405             nodes[6].type == JSON_TYPE_STRING &&
          406             !strcmp(nodes[2].name, "retweeted_status") &&
          407             !strcmp(nodes[3].name, "extended_entities") &&
          408             !strcmp(nodes[4].name, "media")) {
          409                 if (!strcmp(nodes[6].name, "media_url_https")) {
          410                         strlcpy(media_url, str, sizeof(media_url));
          411                 } else if (!strcmp(nodes[6].name, "url")) {
          412                         strlcpy(url, str, sizeof(url));
          413                 } else if (!strcmp(nodes[6].name, "expanded_url")) {
          414                         strlcpy(expanded_url, str, sizeof(expanded_url));
          415                 } else if (!strcmp(nodes[6].name, "type")) {
          416                         if (!strcmp(str, "photo")) {
          417                                 addreplacement(url, media_url);
          418                         } else {
          419                                 addreplacement(url, expanded_url);
          420                         }
          421                         media_url[0] = url[0] = expanded_url[0] = '\0';
          422                 }
          423         }
          424 }
          425 
          426 int
          427 main(void)
          428 {
          429         struct tweet *t;
          430         size_t i;
          431 
          432         if (pledge("stdio", NULL) == -1)
          433                 err(1, "pledge");
          434 
          435         if (parsejson(processnodes))
          436                 errx(2, "invalid JSON");
          437 
          438         /* replace some HTML entities */
          439         addreplacement("&lt;", "<");
          440         addreplacement("&gt;", ">");
          441         addreplacement("&amp;", "&");
          442 
          443         for (t = tweets; t; t = t->next) {
          444                 /* check for pinned tweets */
          445                 for (i = 0; i < npinned; i++) {
          446                         if (!strcmp(t->itemid, pinnedids[i])) {
          447                                 t->ispinned = 1;
          448                                 break;
          449                         }
          450                 }
          451                 printtweet(t);
          452         }
          453 
          454         return 0;
          455 }