gopher-crawler.c - randomcrap - random crap programs of varying quality
 (HTM) git clone git://git.codemadness.org/randomcrap
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       gopher-crawler.c (4813B)
       ---
            1 #include <sys/socket.h>
            2 #include <sys/time.h>
            3 #include <sys/tree.h> /* RB-tree */
            4 #include <sys/types.h>
            5 
            6 #include <err.h>
            7 #include <errno.h>
            8 #include <netdb.h>
            9 #include <stdio.h>
           10 #include <stdlib.h>
           11 #include <string.h>
           12 #include <unistd.h>
           13 
           14 struct visited { /* one gopher location (server, port, path); element type of the visited tree */
           15         RB_ENTRY(visited) entry; /* tree linkage used by the RB_* macros from sys/tree.h */
           16         char server[256];        /* host name, NUL-terminated */
           17         char port[8];            /* port kept as a string (e.g. "70"), NUL-terminated */
           18         char path[1024];         /* gopher selector, NUL-terminated; may be empty */
           19 };
           20 
           21 const int config_localonly = 1; /* non-zero: crawl() skips every location whose server or port differs from the two settings below */
           22 const char *config_server = "git.codemadness.org"; /* server main() starts crawling at */
           23 /*const char *config_server = "bitreich.org";*/
           24 const char *config_port = "70"; /* port main() starts crawling at */
           25 
           26 int
           27 visitedcmp(struct visited *v1, struct visited *v2)
           28 {
           29         int n;
           30 
           31         if ((n = strcmp(v1->server, v2->server)))
           32                 return n;
           33         if ((n = strcmp(v1->port, v2->port)))
           34                 return n;
           35         if ((n = strcmp(v1->path, v2->path)))
           36                 return n;
           37         return n;
           38 }
           39 
           40 RB_HEAD(vistree, visited) head = RB_INITIALIZER(&head); /* global tree of the locations crawl() has recorded; starts out empty */
           41 RB_GENERATE(vistree, visited, entry, visitedcmp) /* generate the vistree functions, ordered by visitedcmp() */
           42 
           /*
            * Open a TCP connection to host:port.
            *
            * The addresses returned by getaddrinfo() are tried in order and the
            * first one that connects is used. Every socket gets a 10 second send
            * and receive timeout before connecting.
            *
            * Returns the connected socket descriptor, or -1 after printing a
            * warning when the name cannot be resolved or no address could be
            * connected. A setsockopt() failure terminates the program.
            */
           int
           dial(const char *host, const char *port)
           {
                   struct addrinfo hints, *list, *ai;
                   struct timeval tv;
                   const char *failed = NULL;
                   int fd, gaierr, saved;

                   tv.tv_sec = 10;
                   tv.tv_usec = 0;

                   memset(&hints, 0, sizeof(hints));
                   hints.ai_socktype = SOCK_STREAM;
                   hints.ai_family = AF_UNSPEC;

                   gaierr = getaddrinfo(host, port, &hints, &list);
                   if (gaierr != 0) {
                           warnx("%s: %s:%s", gai_strerror(gaierr), host, port);
                           return -1;
                   }

                   fd = -1;
                   for (ai = list; ai != NULL; ai = ai->ai_next) {
                           fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
                           if (fd == -1) {
                                   failed = "socket";
                                   continue;
                           }

                           if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1)
                                   err(1, "setsockopt");
                           if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
                                   err(1, "setsockopt");

                           if (connect(fd, ai->ai_addr, ai->ai_addrlen) != -1)
                                   break; /* connected */

                           /* close() may clobber errno; keep connect()'s value for warn() */
                           failed = "connect";
                           saved = errno;
                           close(fd);
                           errno = saved;
                           fd = -1;
                   }

                   if (fd == -1)
                           warn("%s: %s:%s", failed, host, port);
                   freeaddrinfo(list);

                   return fd;
           }
           91 
           /*
            * Decide whether a location must not be crawled.
            *
            * host: server name; blacklisted when it ends in ".onion".
            * port: currently not taken into account.
            * path: selector; blacklisted when it contains "/git/", "/scm/" or
            *       "/commit/".
            *
            * Returns 1 when the location is blacklisted, 0 otherwise.
            */
           int
           blacklisted(const char *host, const char *port, const char *path)
           {
                   static const char suffix[] = ".onion";
                   static const char *const badpaths[] = { "/git/", "/scm/", "/commit/" };
                   size_t hostlen, suffixlen, i;

                   (void)port;

                   /*
                    * Compare the tail of the host name itself. Searching with strstr()
                    * only finds the first ".onion", which misses names such as
                    * "a.onion.b.onion".
                    */
                   hostlen = strlen(host);
                   suffixlen = sizeof(suffix) - 1;
                   if (hostlen >= suffixlen &&
                       strcmp(host + (hostlen - suffixlen), suffix) == 0)
                           return 1;

                   for (i = 0; i < sizeof(badpaths) / sizeof(badpaths[0]); i++) {
                           if (strstr(path, badpaths[i]))
                                   return 1;
                   }

                   return 0;
           }
          107 
          108 int
          109 crawl(const char *server, const char *port, const char *path)
          110 {
          111         struct visited v, *ve;
          112         FILE *fp;
          113         char line[1024];
          114         size_t linenr;
          115         ssize_t n;
          116         int fd, r, i, len;
          117         size_t totalsiz;
          118 
          119         if (!server[0] || !port[0] || blacklisted(server, port, path))
          120                 return -1;
          121         if (config_localonly) {
          122                 if (strcmp(server, config_server) ||
          123                     strcmp(port, config_port)) {
          124                             fprintf(stderr, "DEBUG: skipping: %s:%s path '%s'\n",
          125                                 server, port, path);
          126                         return -1;
          127                 }
          128         }
          129 
          130         if (!(ve = malloc(sizeof(*ve))))
          131                 err(1, "malloc");
          132         strlcpy(ve->server, server, sizeof(ve->server));
          133         strlcpy(ve->port, port, sizeof(ve->port));
          134         strlcpy(ve->path, path, sizeof(ve->path));
          135         RB_INSERT(vistree, &head, ve);
          136 
          137         if ((fd = dial(server, port)) == -1)
          138                 return -1;
          139 
          140         r = dprintf(fd, "%s\r\n", path);
          141         if (r == -1)
          142                 err(1, "write");
          143 
          144         if (!(fp = fdopen(fd, "rb+")))
          145                 err(1, "fdopen");
          146 
          147         totalsiz = 0;
          148         for (linenr = 0; fgets(line, sizeof(line), fp); ++linenr) {
          149                 n = strcspn(line, "\n");
          150                 if (line[n] != '\n') {
          151                         break; /* line too long: skip this page */
          152                 }
          153                 if (n && line[n] == '\n')
          154                         line[n] = '\0';
          155                 if (n && line[n - 1] == '\r')
          156                         line[--n] = '\0';
          157                 if (n == 1 && line[0] == '.')
          158                         break;
          159 
          160                 /* too big total response */
          161                 totalsiz += n;
          162                 if (totalsiz > 1048576)
          163                         break;
          164 
          165                 printf("%s\t%s\t%zu\t%s\t%s\n",
          166                        server, port, linenr, path, line);
          167 
          168                 /* directory */
          169                 if (line[0] != '1')
          170                         continue;
          171 
          172                 /* skip "username" */
          173                 i = 1;
          174                 len = strcspn(line + i, "\t");
          175                 if (line[i + len] == '\t')
          176                         i += len + 1;
          177                 else
          178                         continue; /* invalid field */
          179 
          180                 memset(&v, 0, sizeof(v));
          181 
          182                 /* selector / path */
          183                 len = strcspn(line + i, "\t");
          184                 if (len + 1 < sizeof(v.path)) {
          185                         memcpy(v.path, line + i, len);
          186                         v.path[len] = '\0';
          187                 } else {
          188                         /* too long path */
          189                         continue;
          190                 }
          191                 if (line[i + len] == '\t')
          192                         i += len + 1;
          193                 else
          194                         continue; /* invalid field */
          195 
          196                 /* server */
          197                 len = strcspn(line + i, "\t");
          198                 if (len + 1 < sizeof(v.server)) {
          199                         memcpy(v.server, line + i, len);
          200                         v.server[len] = '\0';
          201                 } else {
          202                         /* too long server */
          203                         continue;
          204                 }
          205                 if (line[i + len] == '\t')
          206                         i += len + 1;
          207                 else
          208                         continue; /* invalid field */
          209 
          210                 /* port */
          211                 len = strcspn(line + i, "\t");
          212                 if (len + 1 < sizeof(v.port)) {
          213                         memcpy(v.port, line + i, len);
          214                         v.port[len] = '\0';
          215                 } else {
          216                         /* too long port */
          217                         continue;
          218                 }
          219 
          220                 if (RB_FIND(vistree, &head, &v) == NULL)
          221                         crawl(v.server, v.port, v.path);
          222         }
          223         if (ferror(fp))
          224                 err(1, "fgets");
          225         fclose(fp);
          226 
          227         return 0;
          228 }
          229 
          230 int
          231 main(void)
          232 {
          233         return crawl(config_server, config_port, "");
          234 }