gopher-crawler.c - randomcrap - random crap programs of varying quality
(HTM) git clone git://git.codemadness.org/randomcrap
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
gopher-crawler.c (4813B)
---
1 #include <sys/socket.h>
2 #include <sys/time.h>
3 #include <sys/tree.h> /* RB-tree */
4 #include <sys/types.h>
5
6 #include <err.h>
7 #include <errno.h>
8 #include <netdb.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <unistd.h>
13
14 struct visited {
15 RB_ENTRY(visited) entry;
16 char server[256];
17 char port[8];
18 char path[1024];
19 };
20
/* When non-zero, only links pointing at config_server:config_port are followed. */
const int config_localonly = 1;

/* Host where the crawl starts (alternative target: "bitreich.org"). */
const char *config_server = "git.codemadness.org";

/* Port of the start host, kept as a string because it is passed to getaddrinfo(). */
const char *config_port = "70";
25
26 int
27 visitedcmp(struct visited *v1, struct visited *v2)
28 {
29 int n;
30
31 if ((n = strcmp(v1->server, v2->server)))
32 return n;
33 if ((n = strcmp(v1->port, v2->port)))
34 return n;
35 if ((n = strcmp(v1->path, v2->path)))
36 return n;
37 return n;
38 }
39
40 RB_HEAD(vistree, visited) head = RB_INITIALIZER(&head);
41 RB_GENERATE(vistree, visited, entry, visitedcmp)
42
/*
 * Open a TCP connection to host:port.
 *
 * Every address returned by getaddrinfo() is tried in order until one
 * connects.  A 10 second send and receive timeout is set on the socket
 * before connecting.
 *
 * Returns the connected socket, or -1 after printing a warning when
 * name resolution fails or no address could be connected.  A failing
 * setsockopt() terminates the program.
 */
int
dial(const char *host, const char *port)
{
	struct addrinfo hints, *ai, *ailist;
	struct timeval tv;
	const char *what = NULL;
	int fd = -1, gaierr, saved;

	tv.tv_sec = 10;
	tv.tv_usec = 0;

	memset(&hints, 0, sizeof(hints));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;

	gaierr = getaddrinfo(host, port, &hints, &ailist);
	if (gaierr != 0) {
		warnx("%s: %s:%s", gai_strerror(gaierr), host, port);
		return -1;
	}

	for (ai = ailist; ai != NULL; ai = ai->ai_next) {
		fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
		if (fd == -1) {
			what = "socket";
			continue;
		}

		if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)) == -1)
			err(1, "setsockopt");
		if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
			err(1, "setsockopt");

		if (connect(fd, ai->ai_addr, ai->ai_addrlen) != -1)
			break; /* connected */

		/* close() may clobber errno; keep connect()'s value for warn() */
		what = "connect";
		saved = errno;
		close(fd);
		errno = saved;
		fd = -1;
	}

	if (fd == -1)
		warn("%s: %s:%s", what, host, port);
	freeaddrinfo(ailist);

	return fd;
}
91
/*
 * Decide whether a location must not be crawled.
 *
 * A location is blacklisted when the host name ends in ".onion" or when
 * the path contains one of the substrings "/git/", "/scm/" or
 * "/commit/".  The port is accepted for interface symmetry with crawl()
 * but does not take part in the decision.
 *
 * Returns 1 if blacklisted, 0 otherwise.
 */
int
blacklisted(const char *host, const char *port, const char *path)
{
	static const char suffix[] = ".onion";
	static const char *const badpaths[] = { "/git/", "/scm/", "/commit/" };
	const size_t suflen = sizeof(suffix) - 1;
	size_t hostlen, i;

	(void)port; /* unused */

	/*
	 * Compare against the tail of the host name.  Searching with
	 * strstr() would only look at the first ".onion" and so miss
	 * hosts such as "a.onion.b.onion".
	 */
	hostlen = strlen(host);
	if (hostlen >= suflen && !strcmp(host + (hostlen - suflen), suffix))
		return 1;

	for (i = 0; i < sizeof(badpaths) / sizeof(badpaths[0]); i++) {
		if (strstr(path, badpaths[i]))
			return 1;
	}

	return 0;
}
107
108 int
109 crawl(const char *server, const char *port, const char *path)
110 {
111 struct visited v, *ve;
112 FILE *fp;
113 char line[1024];
114 size_t linenr;
115 ssize_t n;
116 int fd, r, i, len;
117 size_t totalsiz;
118
119 if (!server[0] || !port[0] || blacklisted(server, port, path))
120 return -1;
121 if (config_localonly) {
122 if (strcmp(server, config_server) ||
123 strcmp(port, config_port)) {
124 fprintf(stderr, "DEBUG: skipping: %s:%s path '%s'\n",
125 server, port, path);
126 return -1;
127 }
128 }
129
130 if (!(ve = malloc(sizeof(*ve))))
131 err(1, "malloc");
132 strlcpy(ve->server, server, sizeof(ve->server));
133 strlcpy(ve->port, port, sizeof(ve->port));
134 strlcpy(ve->path, path, sizeof(ve->path));
135 RB_INSERT(vistree, &head, ve);
136
137 if ((fd = dial(server, port)) == -1)
138 return -1;
139
140 r = dprintf(fd, "%s\r\n", path);
141 if (r == -1)
142 err(1, "write");
143
144 if (!(fp = fdopen(fd, "rb+")))
145 err(1, "fdopen");
146
147 totalsiz = 0;
148 for (linenr = 0; fgets(line, sizeof(line), fp); ++linenr) {
149 n = strcspn(line, "\n");
150 if (line[n] != '\n') {
151 break; /* line too long: skip this page */
152 }
153 if (n && line[n] == '\n')
154 line[n] = '\0';
155 if (n && line[n - 1] == '\r')
156 line[--n] = '\0';
157 if (n == 1 && line[0] == '.')
158 break;
159
160 /* too big total response */
161 totalsiz += n;
162 if (totalsiz > 1048576)
163 break;
164
165 printf("%s\t%s\t%zu\t%s\t%s\n",
166 server, port, linenr, path, line);
167
168 /* directory */
169 if (line[0] != '1')
170 continue;
171
172 /* skip "username" */
173 i = 1;
174 len = strcspn(line + i, "\t");
175 if (line[i + len] == '\t')
176 i += len + 1;
177 else
178 continue; /* invalid field */
179
180 memset(&v, 0, sizeof(v));
181
182 /* selector / path */
183 len = strcspn(line + i, "\t");
184 if (len + 1 < sizeof(v.path)) {
185 memcpy(v.path, line + i, len);
186 v.path[len] = '\0';
187 } else {
188 /* too long path */
189 continue;
190 }
191 if (line[i + len] == '\t')
192 i += len + 1;
193 else
194 continue; /* invalid field */
195
196 /* server */
197 len = strcspn(line + i, "\t");
198 if (len + 1 < sizeof(v.server)) {
199 memcpy(v.server, line + i, len);
200 v.server[len] = '\0';
201 } else {
202 /* too long server */
203 continue;
204 }
205 if (line[i + len] == '\t')
206 i += len + 1;
207 else
208 continue; /* invalid field */
209
210 /* port */
211 len = strcspn(line + i, "\t");
212 if (len + 1 < sizeof(v.port)) {
213 memcpy(v.port, line + i, len);
214 v.port[len] = '\0';
215 } else {
216 /* too long port */
217 continue;
218 }
219
220 if (RB_FIND(vistree, &head, &v) == NULL)
221 crawl(v.server, v.port, v.path);
222 }
223 if (ferror(fp))
224 err(1, "fgets");
225 fclose(fp);
226
227 return 0;
228 }
229
230 int
231 main(void)
232 {
233 return crawl(config_server, config_port, "");
234 }