gopher-clustering2.c - brcon2024-hackathons - Bitreichcon 2024 Hackathons
(HTM) git clone git://bitreich.org/brcon2024-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2024-hackathons
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
(DIR) Submodules
---
gopher-clustering2.c (24998B)
---
1 #define _POSIX_C_SOURCE 200112L
2
3 #include <assert.h>
4 #include <stdarg.h>
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <stdlib.h>
8 #include <string.h>
9
10 #include <math.h>
11
12 #include <netdb.h>
13 #include <sys/socket.h>
14 #include <sys/types.h>
15 #include <unistd.h>
16
17 uint32_t
18 fnv1a(int n,...)
19 {
20 int i;
21 char *s;
22 va_list l;
23 uint32_t h;
24
25 h = 0x811c9dc5;
26
27 va_start(l, n);
28 for (i = 0; i < n; i++) {
29 for (s = va_arg(l, char*); *s; s++) {
30 h ^= *s;
31 h *= 0x01000193;
32 }
33 }
34 va_end(l);
35
36 return h;
37 }
38
39 uint32_t
40 xorshift(uint32_t *s)
41 {
42 *s ^= *s << 13;
43 *s ^= *s >> 17;
44 *s ^= *s << 5;
45 return *s;
46 }
47
48 /*
49 https://github.com/skeeto/hash-prospector/issues/19#issuecomment-1120105785
50 */
51 uint32_t
52 mix(uint32_t x)
53 {
54 x ^= x >> 16;
55 x *= 0x21f0aaad;
56 x ^= x >> 15;
57 x *= 0x735a2d97;
58 x ^= x >> 16;
59 return x;
60 }
61
62 struct gopherentry {
63 struct gopherentry *next;
64 char type;
65 char *fields[4];
66 size_t i, s;
67 };
68
69 char *
70 gopher_fieldend(char *s)
71 {
72 for (; *s; s++) {
73 if (*s == '\t')
74 break;
75 if (*s == '\r' && *(s+1) == '\n')
76 break;
77 if (*s == '\n')
78 break;
79 }
80 return s;
81 }
82
83 char *
84 gopher_nextentry(char *s)
85 {
86 char *c;
87
88 if (c = strchr(s, '\n'))
89 return c + 1;
90 return NULL;
91 }
92
93 void *
94 xmalloc(size_t s)
95 {
96 void *p;
97
98 if (!(p = malloc(s))) {
99 perror(NULL);
100 exit(1);
101 }
102 return p;
103 }
104
105 void *
106 xrealloc(void *p, size_t s)
107 {
108 if (!(p = realloc(p, s))) {
109 perror(NULL);
110 exit(1);
111 }
112 return p;
113 }
114
115 /* Allow a lot of ugly things... */
116 size_t
117 gopher_parsemenu(char *d, struct gopherentry **g)
118 {
119 ssize_t gl;
120 size_t fl, i, s;
121 char *c, *p, pt;
122 struct gopherentry *cg;
123
124 gl = 0;
125 cg = NULL;
126
127 s = 0;
128 pt = 0;
129 for (c = d; c && *c; c = gopher_nextentry(c)) {
130 if (*c == '.')
131 break;
132 if (*c == '\n')
133 continue;
134
135 gl++;
136 cg = xrealloc(cg, gl * sizeof(struct gopherentry));
137 memset(&cg[gl-1], 0, sizeof(struct gopherentry));
138
139 if (*c != 'i' && pt == 'i')
140 s++;
141 pt = *c;
142
143 cg[gl-1].type = *c;
144 cg[gl-1].i = gl-1;
145 cg[gl-1].s = s;
146 c++;
147
148 for (i = 0; i < 4; i++) {
149 p = gopher_fieldend(c);
150 fl = p - c;
151 cg[gl-1].fields[i] = xmalloc(fl + 1);
152 memcpy(cg[gl-1].fields[i], c, fl);
153 cg[gl-1].fields[i][fl] = '\0';
154 if (*p != '\t')
155 break;
156 c = p + 1;
157 }
158 }
159 s++;
160 *g = cg;
161 return gl;
162 }
163
164 ssize_t
165 gopher_removeentries(struct gopherentry *g, size_t l)
166 {
167 size_t i, j;
168
169 for (i = 0; i < l;) {
170 if (g[i].type == 'i' || g[i].type == '3' || !g[i].fields[2] || !g[i].fields[3]) {
171 l--;
172 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g));
173 continue;
174 }
175 for (j = 0; j < i; j++) {
176 if (!strcmp(g[i].fields[1], g[j].fields[1]) &&
177 !strcmp(g[i].fields[2], g[j].fields[2]) &&
178 !strcmp(g[i].fields[3], g[j].fields[3]))
179 break;
180 }
181 if (j < i) {
182 l--;
183 memmove(&g[i], &g[i+1], (l - i) * sizeof(*g));
184 continue;
185 }
186 i++;
187 }
188
189 return l;
190 }
191
192 ssize_t
193 gopher_request(char *host, char *port, char *selector, char *buffer, size_t l)
194 {
195 int fd;
196 struct addrinfo hints;
197 struct addrinfo *r, *rp;
198 ssize_t n, result, rn;
199
200 memset(&hints, 0, sizeof(hints));
201 hints.ai_family = AF_UNSPEC;
202 hints.ai_socktype = SOCK_STREAM;
203
204 if (getaddrinfo(host, port, &hints, &r) != 0)
205 return -1;
206
207 for (rp = r; rp; rp = rp->ai_next) {
208 if ((fd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol)) == -1)
209 continue;
210 if (connect(fd, rp->ai_addr, rp->ai_addrlen) != -1)
211 break;
212 close(fd);
213 }
214 freeaddrinfo(r);
215 if (!rp)
216 return -1;
217
218 result = -1;
219 if (write(fd, selector, strlen(selector)) != strlen(selector))
220 goto cleanup;
221
222 if (write(fd, "\r\n", 2) != 2)
223 goto cleanup;
224
225 n = 0;
226 while (1) {
227 if (n + 512 > l - 1)
228 goto cleanup;
229 if ((rn = read(fd, &buffer[n], 512)) == -1)
230 goto cleanup;
231 else if (!rn)
232 break;
233 n += rn;
234 }
235
236 buffer[n] = '\0';
237 result = n;
238
239 cleanup:
240 close(fd);
241
242 return result;
243 }
244
245 size_t
246 triangularnumber(size_t n)
247 {
248 return (n * n + n) / 2;
249 }
250
251 size_t
252 distanceindex(size_t i, size_t j)
253 {
254 size_t t;
255
256 if (i < j) {
257 t = i;
258 i = j;
259 j = t;
260 }
261
262 return triangularnumber(i) + j;
263 }
264
265 struct point {
266 double dims[2];
267 };
268
269 #define SAMMON_MAPPING_MAX_ITER (1 << 13)
270 #define SAMMON_MAPPING_LR (0.01)
271 void
272 sammon(double *distances, size_t l, uint32_t prng, struct point *points)
273 {
274 size_t i, j, k, t, tmax;
275 double *gnarf;
276 double c, lr;
277 double e1, e2, sum, lasterror;
278 struct point *newpoints;
279
280 gnarf = calloc(triangularnumber(l), sizeof(*gnarf));
281 newpoints = calloc(l, sizeof(*newpoints));
282
283 for (i = 0; i < l; i++) {
284 points[i].dims[0] = (double)xorshift(&prng) / UINT32_MAX - 0.5;
285 points[i].dims[1] = (double)xorshift(&prng) / UINT32_MAX - 0.5;
286 }
287
288 c = 0;
289 for (i = 0; i < l; i++) {
290 gnarf[distanceindex(i, i)] = 0;
291 for (j = 0; j < i; j++) {
292 gnarf[distanceindex(i, j)] = sqrt(pow(points[i].dims[0] - points[j].dims[0], 2) + pow(points[i].dims[1] - points[j].dims[1], 2));
293 c += distances[distanceindex(i, j)];
294 }
295 }
296
297 lasterror = 0;
298 for (t = 0, tmax = SAMMON_MAPPING_MAX_ITER; t < tmax; t++) {
299 lr = SAMMON_MAPPING_LR;
300 for (i = 0; i < l; i++) {
301 for (j = 0; j < 2; j++) {
302 e1 = -2. / c;
303 sum = 0;
304 for (k = 0; k < l; k++) {
305 if (i == k)
306 continue;
307 sum += (distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) / (distances[distanceindex(i, k)] * gnarf[distanceindex(i, k)]) * (points[i].dims[j] - points[k].dims[j]);
308 }
309 e1 *= sum;
310
311 e2 = -2. / c;
312 sum = 0;
313 for (k = 0; k < l; k++) {
314 if (i == k)
315 continue;
316 sum += 1. / (distances[distanceindex(i, k)] * gnarf[distanceindex(i, k)]) * ((distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) - pow(points[i].dims[j] - points[k].dims[j], 2) / gnarf[distanceindex(i, k)] * (1 + (distances[distanceindex(i, k)] - gnarf[distanceindex(i, k)]) / gnarf[distanceindex(i, k)]));
317 }
318 e2 *= sum;
319
320 newpoints[i].dims[j] = points[i].dims[j] - lr * (e1 / fabs(e2));
321 }
322 }
323
324 sum = 0;
325 for (i = 0; i < l; i++) {
326 for (j = 0; j < i; j++) {
327 gnarf[distanceindex(i, j)] = sqrt(pow(newpoints[i].dims[0] - newpoints[j].dims[0], 2) + pow(newpoints[i].dims[1] - newpoints[j].dims[1], 2));
328 sum += pow(distances[distanceindex(i, j)] - gnarf[distanceindex(i, j)], 2) / distances[distanceindex(i, j)];
329 }
330 }
331
332 fprintf(stderr, "sammon mapping error: %lf\n", sum / c);
333 if (t > 0 && fabs(sum / c - lasterror) < 0.000000000001)
334 break;
335 lasterror = sum / c;
336
337 memcpy(points, newpoints, l * sizeof(*points));
338 }
339
340 free(newpoints);
341 free(gnarf);
342 }
343
344 int
345 fastermsc(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng)
346 {
347 size_t i, j, n, ck, grimmel;
348 ssize_t m1, m, m2;
349 size_t ti, th, huh;
350 double dsum, d, d2, gnarf, tdsum;
351 struct {
352 size_t medoid;
353 size_t n;
354 double dsum;
355 } *clusters;
356 struct {
357 size_t n1, n2, n3;
358 double d1, d2, d3;
359 } *cache;
360 double *dS, *dS2;
361 double cdS, cdS2;
362 size_t cm, cx;
363 ssize_t lx;
364 int flag, result;
365 uint32_t r;
366 double *C;
367
368 if (!l)
369 return 0;
370
371 if (l < k)
372 k = l;
373
374 clusters = calloc(k, sizeof(*clusters));
375 if (!clusters)
376 return -1;
377
378 result = -1;
379
380 /*
381 Seed the medoids randomly
382 */
383 for (i = n = 0; i < l; i++) {
384 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k - n)
385 clusters[n++].medoid = i;
386 if (n == k)
387 break;
388 }
389
390 cache = calloc(l, sizeof(*cache));
391 if (!cache)
392 goto cleanup;
393
394 dS = calloc(k, sizeof(*dS));
395 if (!dS)
396 goto cleanup;
397
398 dS2 = calloc(k, sizeof(*dS2));
399 if (!dS2)
400 goto cleanup;
401
402 for (i = 0; i < l; i++) {
403 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY;
404 for (j = 0; j < k; j++) {
405 d = distances[distanceindex(i, clusters[j].medoid)];
406 if (d < cache[i].d1) {
407 cache[i].d3 = cache[i].d2;
408 cache[i].d2 = cache[i].d1;
409 cache[i].d1 = d;
410 cache[i].n3 = cache[i].n2;
411 cache[i].n2 = cache[i].n1;
412 cache[i].n1 = j;
413 } else if (d < cache[i].d2) {
414 cache[i].d3 = cache[i].d2;
415 cache[i].d2 = d;
416 cache[i].n3 = cache[i].n2;
417 cache[i].n2 = j;
418 } else if (d < cache[i].d3) {
419 cache[i].d3 = d;
420 cache[i].n3 = j;
421 }
422 }
423 }
424
425 for (i = 0; i < k; i++) {
426 dS[i] = 0;
427 for (j = 0; j < l; j++) {
428 if (cache[j].n1 != i)
429 continue;
430 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2 / cache[j].d3;
431 }
432 for (j = 0; j < l; j++) {
433 if (cache[j].n2 != i)
434 continue;
435 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1 / cache[j].d3;
436 }
437 }
438
439 lx = -1;
440 for (grimmel = 0;grimmel < 32;grimmel++) {
441 cdS = 0;
442 for (i = 0; i < l; i++) {
443 if (i == lx)
444 goto fastermsc_finished;
445
446 for (j = 0; j < k; j++) {
447 if (clusters[j].medoid == i)
448 goto fastmsc_nexti;
449 }
450
451 memcpy(dS2, dS, k * sizeof(*dS2));
452 cdS2 = 0;
453 for (j = 0; j < l; j++) {
454 d = distances[distanceindex(i, j)];
455 if (d < cache[j].d1) {
456 cdS2 += cache[j].d1 / cache[j].d2 - d / cache[j].d1;
457 dS2[cache[j].n1] += d / cache[j].d1 + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2;
458 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2;
459 } else if (d < cache[j].d2) {
460 cdS2 += cache[j].d1 / cache[j].d2 - cache[j].d1 / d;
461 dS2[cache[j].n1] += cache[j].d1 / d + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2;
462 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2;
463 } else if (d < cache[j].d3) {
464 dS2[cache[j].n1] += cache[j].d2 / cache[j].d3 - cache[j].d2 / d;
465 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / d;
466 }
467 }
468 d = -INFINITY;
469 for (j = 0; j < k; j++) {
470 if (dS2[j] > d) {
471 d = dS2[j];
472 m = j;
473 }
474 }
475 dS2[m] += cdS2;
476 if (dS2[m] <= 0)
477 continue;
478 lx = i;
479 clusters[m].medoid = i;
480 for (j = 0; j < l; j++) {
481 if (cache[j].n1 != m && cache[j].n2 != m && cache[j].n3 != m)
482 continue;
483 cache[j].d1 = cache[j].d2 = cache[j].d3 = INFINITY;
484 for (n = 0; n < k; n++) {
485 d = distances[distanceindex(j, clusters[n].medoid)];
486 if (d < cache[j].d1) {
487 cache[j].d3 = cache[j].d2;
488 cache[j].d2 = cache[j].d1;
489 cache[j].d1 = d;
490 cache[j].n3 = cache[j].n2;
491 cache[j].n2 = cache[j].n1;
492 cache[j].n1 = n;
493 } else if (d < cache[j].d2) {
494 cache[j].d3 = cache[j].d2;
495 cache[j].d2 = d;
496 cache[j].n3 = cache[j].n2;
497 cache[j].n2 = n;
498 } else if (d < cache[j].d3) {
499 cache[j].d3 = d;
500 cache[j].n3 = n;
501 }
502 }
503 }
504 for (j = 0; j < k; j++) {
505 dS[j] = 0;
506 for (n = 0; n < l; n++) {
507 if (cache[n].n1 != j)
508 continue;
509 dS[j] += cache[n].d1 / cache[n].d2 - cache[n].d2 / cache[n].d3;
510 }
511 for (n = 0; n < l; n++) {
512 if (cache[n].n2 != j)
513 continue;
514 dS[j] += cache[n].d1 / cache[n].d2 - cache[n].d1 / cache[n].d3;
515 }
516 }
517 fastmsc_nexti:;
518 }
519 /*
520 if (lx == -1)
521 break;
522 */
523 }
524 fastermsc_finished:;
525
526 for (i = 0; i < k; i++)
527 clusters[i].n = 0;
528
529 for (i = 0; i < l; i++) {
530 d = INFINITY;
531 for (j = 0; j < k; j++) {
532 if (distances[distanceindex(clusters[j].medoid, i)] < d) {
533 assoc[i] = j;
534 d = distances[distanceindex(clusters[j].medoid, i)];
535 }
536 }
537 clusters[assoc[i]].n++;
538 }
539
540 for (i = 0; i < l; i++) {
541 if (clusters[assoc[i]].medoid == i)
542 assoc[i] |= 128;
543 }
544
545 result = k;
546 cleanup:
547 free(clusters);
548 return result;
549 }
550
551 int
552 fastmsc(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng)
553 {
554 size_t i, j, n, ck;
555 ssize_t m1, m, m2;
556 size_t ti, th;
557 double dsum, d, d2, gnarf, tdsum;
558 struct {
559 size_t medoid;
560 size_t n;
561 double dsum;
562 } *clusters;
563 struct {
564 size_t n1, n2;
565 double d1, d2, d3;
566 } *cache;
567 double *dS, *dS2;
568 double cdS, cdS2;
569 double _e, _y, _z, _s;
570 size_t cm, cx;
571 int flag, result;
572 uint32_t r;
573 double *C;
574
575 if (!l)
576 return 0;
577
578 if (l < k)
579 k = l;
580
581 clusters = calloc(k, sizeof(*clusters));
582 if (!clusters)
583 return -1;
584
585 result = -1;
586
587 /*
588 seed the medoids using PAM BUILD
589 d = INFINITY;
590 m = -1;
591 for (i = 0; i < l; i++) {
592 dsum = 0;
593 for (j = 0; j < l; j++)
594 dsum += distances[distanceindex(i, j)];
595 if (dsum < d) {
596 m = i;
597 d = dsum;
598 }
599 }
600 if (m == -1)
601 goto cleanup;
602 ck = 0;
603 clusters[ck].medoid = m;
604 ck++;
605
606 C = xmalloc(l * l * sizeof(*C));
607
608 for (; ck < k; ck++) {
609 memset(C, 0, l * l * sizeof(*C));
610 for (i = 0; i < l; i++) {
611 for (j = 0; j < ck; j++) {
612 if (clusters[j].medoid == i)
613 goto build_nexti;
614 }
615 for (j = 0; j < l; j++) {
616 if (i == j)
617 continue;
618 for (n = 0; n < ck; n++) {
619 if (clusters[n].medoid == j)
620 goto build_nextj;
621 }
622 d = INFINITY;
623 for (n = 0; n < ck; n++) {
624 if (distances[distanceindex(j, clusters[n].medoid)] < d)
625 d = distances[distanceindex(j, clusters[n].medoid)];
626 }
627 C[j * l + i] = d - distances[distanceindex(j, i)];
628 if (C[j * l + i] < 0)
629 C[j * l + i] = 0;
630 build_nextj:;
631 }
632 build_nexti:;
633 }
634 d = -1;
635 m = -1;
636 for (i = 0; i < l; i++) {
637 for (j = 0; j < ck; j++) {
638 if (clusters[j].medoid == i)
639 goto build_nexti2;
640 }
641 dsum = 0;
642 for (j = 0; j < l; j++)
643 dsum += C[j * l + i];
644 if (dsum > d) {
645 d = dsum;
646 m = i;
647 }
648 build_nexti2:;
649 }
650 clusters[ck].medoid = m;
651 }
652 free(C);
653 */
654
655 /*
656 Seed the medoids randomly
657 */
658 for (i = n = 0; i < l; i++) {
659 if ((l - i) * (xorshift(&prng) / (double)UINT32_MAX) < k - n)
660 clusters[n++].medoid = i;
661 if (n == k)
662 break;
663 }
664
665 cache = calloc(l, sizeof(*cache));
666 if (!cache)
667 goto cleanup;
668
669 dS = calloc(k, sizeof(*dS));
670 if (!dS)
671 goto cleanup;
672
673 dS2 = calloc(k, sizeof(*dS2));
674 if (!dS2)
675 goto cleanup;
676
677 for (;;) {
678 for (i = 0; i < l; i++) {
679 cache[i].d1 = cache[i].d2 = cache[i].d3 = INFINITY;
680 for (j = 0; j < k; j++) {
681 d = distances[distanceindex(i, clusters[j].medoid)];
682 if (d < cache[i].d1) {
683 cache[i].d3 = cache[i].d2;
684 cache[i].d2 = cache[i].d1;
685 cache[i].d1 = d;
686 cache[i].n2 = cache[i].n1;
687 cache[i].n1 = j;
688 } else if (d < cache[i].d2) {
689 cache[i].d3 = cache[i].d2;
690 cache[i].d2 = d;
691 cache[i].n2 = j;
692 } else if (d < cache[i].d3) {
693 cache[i].d3 = d;
694 }
695 }
696 }
697
698 for (i = 0; i < k; i++) {
699 dS[i] = 0;
700 for (j = 0; j < l; j++) {
701 if (cache[j].n1 != i)
702 continue;
703 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d2 / cache[j].d3;
704 }
705 for (j = 0; j < l; j++) {
706 if (cache[j].n2 != i)
707 continue;
708 dS[i] += cache[j].d1 / cache[j].d2 - cache[j].d1 / cache[j].d3;
709 }
710 }
711
712 cdS = 0;
713 for (i = 0; i < l; i++) {
714 for (j = 0; j < k; j++) {
715 if (clusters[j].medoid == i)
716 goto fastmsc_nexti;
717 }
718 memcpy(dS2, dS, k * sizeof(*dS2));
719 cdS2 = _e = 0;
720 for (j = 0; j < l; j++) {
721 d = distances[distanceindex(i, j)];
722 if (d < cache[j].d1) {
723 cdS2 += cache[j].d1 / cache[j].d2 - d / cache[j].d1;
724 dS2[cache[j].n1] += d / cache[j].d1 + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2;
725 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2;
726 } else if (d < cache[j].d2) {
727 cdS2 += cache[j].d1 / cache[j].d2 - cache[j].d1 / d;
728 dS2[cache[j].n1] += cache[j].d1 / d + cache[j].d2 / cache[j].d3 - (cache[j].d1 + d) / cache[j].d2;
729 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / cache[j].d2;
730 } else if (d < cache[j].d3) {
731 dS2[cache[j].n1] += cache[j].d2 / cache[j].d3 - cache[j].d2 / d;
732 dS2[cache[j].n2] += cache[j].d1 / cache[j].d3 - cache[j].d1 / d;
733 }
734 }
735 d = -INFINITY;
736 d2 = INFINITY;
737 for (j = 0; j < k; j++) {
738 if (dS2[j] > d) {
739 d = dS2[j];
740 m = j;
741 }
742 if (dS2[j] < d2) {
743 d2 = dS2[j];
744 m2 = j;
745 }
746 }
747 dS2[m] += cdS2;
748 if (dS2[m] > cdS) {
749 cdS = dS2[m];
750 cm = m;
751 cx = i;
752 }
753 fastmsc_nexti:;
754 }
755 /* Should be 0. I guess some floating point problems? */
756 if (cdS <= 0.00000000000001)
757 break;
758 clusters[cm].medoid = cx;
759 }
760
761 for (i = 0; i < k; i++)
762 clusters[i].n = 0;
763
764 for (i = 0; i < l; i++) {
765 d = INFINITY;
766 for (j = 0; j < k; j++) {
767 if (distances[distanceindex(clusters[j].medoid, i)] < d) {
768 assoc[i] = j;
769 d = distances[distanceindex(clusters[j].medoid, i)];
770 }
771 }
772 clusters[assoc[i]].n++;
773 }
774
775 for (i = 0; i < l; i++) {
776 if (clusters[assoc[i]].medoid == i)
777 assoc[i] |= 128;
778 }
779
780 result = k;
781 cleanup:
782 free(clusters);
783 return result;
784 }
785
786 /*
787 Seeding of medoids based on the PAM BUILD description in https://www.cs.umb.edu/cs738/pam1.pdf
788 I was too stupid to understand other descriptions.
789 */
790 int
791 kmedoids(struct gopherentry *g, size_t l, size_t k, double *distances, size_t *assoc, uint32_t prng)
792 {
793 size_t i, j, n, ck;
794 ssize_t m1, m;
795 size_t ti, th;
796 double dsum, d, d2, gnarf, tdsum;
797 struct {
798 size_t medoid;
799 size_t n;
800 double dsum;
801 } *clusters;
802 int flag, result;
803 uint32_t r;
804 double *C;
805
806 if (!l)
807 return 0;
808
809 if (l < k)
810 k = l;
811
812 clusters = calloc(k, sizeof(*clusters));
813 if (!clusters)
814 return -1;
815
816 result = -1;
817
818 /* seed the medoids using PAM BUILD */
819 d = INFINITY;
820 m = -1;
821 for (i = 0; i < l; i++) {
822 dsum = 0;
823 for (j = 0; j < l; j++)
824 dsum += distances[distanceindex(i, j)];
825 if (dsum < d) {
826 m = i;
827 d = dsum;
828 }
829 }
830 if (m == -1)
831 goto cleanup;
832 ck = 0;
833 clusters[ck].medoid = m;
834 ck++;
835
836 C = xmalloc(l * l * sizeof(*C));
837
838 for (; ck < k; ck++) {
839 memset(C, 0, l * l * sizeof(*C));
840 for (i = 0; i < l; i++) {
841 for (j = 0; j < ck; j++) {
842 if (clusters[j].medoid == i)
843 goto build_nexti;
844 }
845 for (j = 0; j < l; j++) {
846 if (i == j)
847 continue;
848 for (n = 0; n < ck; n++) {
849 if (clusters[n].medoid == j)
850 goto build_nextj;
851 }
852 d = INFINITY;
853 for (n = 0; n < ck; n++) {
854 if (distances[distanceindex(j, clusters[n].medoid)] < d)
855 d = distances[distanceindex(j, clusters[n].medoid)];
856 }
857 C[j * l + i] = d - distances[distanceindex(j, i)];
858 if (C[j * l + i] < 0)
859 C[j * l + i] = 0;
860 build_nextj:;
861 }
862 build_nexti:;
863 }
864 d = -1;
865 m = -1;
866 for (i = 0; i < l; i++) {
867 for (j = 0; j < ck; j++) {
868 if (clusters[j].medoid == i)
869 goto build_nexti2;
870 }
871 dsum = 0;
872 for (j = 0; j < l; j++)
873 dsum += C[j * l + i];
874 if (dsum > d) {
875 d = dsum;
876 m = i;
877 }
878 build_nexti2:;
879 }
880 clusters[ck].medoid = m;
881 }
882 free(C);
883
884 for (;;) {
885 tdsum = INFINITY;
886 for (i = 0; i < k; i++) {
887 for (j = 0; j < l; j++) {
888 for (n = 0; n < k; n++) {
889 if (j == clusters[n].medoid)
890 goto swap_nextj;
891 }
892 dsum = 0;
893 for (n = 0; n < l; n++) {
894 if (n == j)
895 continue;
896 for (m = 0; m < k; m++) {
897 if (clusters[m].medoid == n)
898 goto swap_nextn;
899 }
900 d = INFINITY;
901 d2 = INFINITY;
902 for (m = 0; m < k; m++) {
903 if (distances[distanceindex(clusters[m].medoid, n)] < d) {
904 d2 = d;
905 d = distances[distanceindex(clusters[m].medoid, n)];
906 } else if (distances[distanceindex(clusters[m].medoid, n)] < d2) {
907 d2 = distances[distanceindex(clusters[m].medoid, n)];
908 }
909 }
910 if (distances[distanceindex(clusters[i].medoid, n)] > d) {
911 if (distances[distanceindex(j, n)] >= d)
912 gnarf = 0;
913 else
914 gnarf = distances[distanceindex(j, n)] - d;
915 } else if (distances[distanceindex(clusters[i].medoid, n)] == d) {
916 if (distances[distanceindex(j, n)] < d2)
917 gnarf = distances[distanceindex(j, n)] - d;
918 else
919 gnarf = d2 - d;
920 } else {
921 continue;
922 }
923 dsum += gnarf;
924 swap_nextn:;
925 }
926 if (dsum < tdsum) {
927 ti = i;
928 th = j;
929 tdsum = dsum;
930 }
931 swap_nextj:;
932 }
933 }
934 if (tdsum >= 0)
935 break;
936 clusters[ti].medoid = th;
937 }
938
939 for (i = 0; i < k; i++)
940 clusters[i].n = 0;
941
942 for (i = 0; i < l; i++) {
943 d = INFINITY;
944 for (j = 0; j < k; j++) {
945 if (distances[distanceindex(clusters[j].medoid, i)] < d) {
946 assoc[i] = j;
947 d = distances[distanceindex(clusters[j].medoid, i)];
948 }
949 }
950 clusters[assoc[i]].n++;
951 }
952
953 for (i = 0; i < l; i++) {
954 if (clusters[assoc[i]].medoid == i)
955 assoc[i] |= 128;
956 }
957
958 result = k;
959 cleanup:
960 free(clusters);
961 return result;
962 }
963
964 size_t
965 diff(size_t a, size_t b)
966 {
967 if (a < b)
968 return b - a;
969 return a - b;
970 }
971
972 /*
973 https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
974 Which references https://www.codeproject.com/Articles/13525/Fast-memory-efficient-Levenshtein-algorithm-2
975 The code there is using the CPOL: https://www.codeproject.com/info/cpol10.aspx
976 */
977 size_t
978 levenshteindistance(char *a, char *b)
979 {
980 size_t i, j, dc, ic, sc, r;
981 size_t *v0, *v1, *t;
982
983 v0 = xmalloc((strlen(b) + 1) * sizeof(*v0));
984 v1 = xmalloc((strlen(b) + 1) * sizeof(*v1));
985
986 for (i = 0; i <= strlen(b); i++)
987 v0[i] = i;
988
989 for (i = 0; i < strlen(a); i++) {
990 v1[0] = i + 1;
991 for (j = 0; j < strlen(b); j++) {
992 dc = v0[j + 1] + 1;
993 ic = v1[j] + 1;
994 if (a[i] == b[j])
995 sc = v0[j];
996 else
997 sc = v0[j] + 1;
998 if (dc < ic && dc < sc)
999 v1[j + 1] = dc;
1000 else if (ic < dc && ic < sc)
1001 v1[j + 1] = ic;
1002 else
1003 v1[j + 1] = sc;
1004 }
1005 t = v0;
1006 v0 = v1;
1007 v1 = t;
1008 }
1009
1010 r = v0[strlen(b)];
1011
1012 free(v1);
1013 free(v0);
1014
1015 return r;
1016 }
1017
1018 /*
1019 Remove common prefix and suffix. The pazz0distance is the sum of the length of the leftover strings.
1020 */
1021 size_t
1022 pazz0distance(char *a, char *b)
1023 {
1024 size_t pl, sl;
1025 size_t al, bl;
1026
1027 al = strlen(a);
1028 bl = strlen(b);
1029
1030 for (pl = 0; pl < al && pl < bl && a[pl] == b[pl]; pl++);
1031 for (sl = 0; al - sl > pl && bl - sl > pl && a[al - 1 - sl] == b[bl - 1 - sl]; sl++);
1032
1033 return al + bl - 2 * (pl + sl);
1034 }
1035
1036 /*
1037 https://en.wikipedia.org/wiki/Levenshtein_distance#Recursive
1038 */
1039 size_t
1040 levenshteindistance_slow(char *a, char *b)
1041 {
1042 size_t k, l, m;
1043
1044 if (!*a)
1045 return strlen(b);
1046 else if (!*b)
1047 return strlen(a);
1048 else if (*a == *b)
1049 return levenshteindistance_slow(a + 1, b + 1);
1050
1051 k = levenshteindistance_slow(a + 1, b);
1052 l = levenshteindistance_slow(a, b + 1);
1053 m = levenshteindistance_slow(a + 1, b + 1);
1054
1055 if (k < l && k < m)
1056 return 1 + k;
1057 else if (l < k && l < m)
1058 return 1 + l;
1059 else
1060 return 1 + m;
1061 }
1062
1063 ssize_t
1064 clustering(char *host, char *port, char *selector, struct gopherentry *g, size_t l, size_t *assocs, size_t k)
1065 {
1066 size_t i, j;
1067 ssize_t r;
1068 double *distances, d;
1069 double *mdists;
1070 struct {
1071 double selector;
1072 double segment;
1073 double index;
1074 } *distanceparts, maxdistanceparts;
1075 uint32_t prng;
1076 struct point *points;
1077
1078 distanceparts = xmalloc(triangularnumber(l) * sizeof(*distanceparts));
1079
1080 /* So the sammon mapping can be happy. */
1081 prng = fnv1a(4, host, port, selector, "distancewiggle");
1082 memset(&maxdistanceparts, 0, sizeof(maxdistanceparts));
1083 for (i = 0; i < l; i++) {
1084 memset(&distanceparts[distanceindex(i, i)], 0, sizeof(*distanceparts));
1085 for (j = 0; j < i; j++) {
1086 distanceparts[distanceindex(i, j)].selector = levenshteindistance(g[i].fields[1], g[j].fields[1]) + levenshteindistance(g[i].fields[2], g[j].fields[2]) + levenshteindistance(g[i].fields[3], g[j].fields[3]) + ((double)xorshift(&prng) / UINT32_MAX) * 0.000000000000000000000000000000000000001;
1087 distanceparts[distanceindex(i, j)].segment = diff(g[i].s, g[j].s);
1088 distanceparts[distanceindex(i, j)].index = diff(g[i].i, g[j].i);
1089
1090 if (distanceparts[distanceindex(i, j)].selector > maxdistanceparts.selector)
1091 maxdistanceparts.selector = distanceparts[distanceindex(i, j)].selector;
1092 if (distanceparts[distanceindex(i, j)].segment > maxdistanceparts.segment)
1093 maxdistanceparts.segment = distanceparts[distanceindex(i, j)].segment;
1094 if (distanceparts[distanceindex(i, j)].index > maxdistanceparts.index)
1095 maxdistanceparts.index = distanceparts[distanceindex(i, j)].index;
1096 }
1097 }
1098 for (i = 0; i < l; i++) {
1099 for (j = 0; j < i; j++) {
1100 distanceparts[distanceindex(i, j)].selector /= maxdistanceparts.selector;
1101 distanceparts[distanceindex(i, j)].segment /= maxdistanceparts.segment;
1102 distanceparts[distanceindex(i, j)].index /= maxdistanceparts.index;
1103 }
1104 }
1105
1106 distances = xmalloc(triangularnumber(l) * sizeof(*distances));
1107 for (i = 0; i < l; i++) {
1108 distances[distanceindex(i, i)] = 0;
1109 for (j = 0; j < i; j++)
1110 distances[distanceindex(i, j)] = 0.75 * distanceparts[distanceindex(i, j)].selector + 0.125 * distanceparts[distanceindex(i, j)].segment + 0.125 * distanceparts[distanceindex(i, j)].index; //distanceparts[distanceindex(i, j)].selector;
1111 }
1112 free(distanceparts);
1113
1114 r = kmedoids(g, l, k, distances, assocs, fnv1a(4, host, port, selector, "seed"));
1115
1116 points = calloc(l, sizeof(*points));
1117 sammon(distances, l, fnv1a(4, host, port, selector, "sammonseed"), points);
1118
1119 for (i = 0; i < l; i++)
1120 printf("%lf\t%lf\t%ld\t%lu\t%s\n", points[i].dims[0], points[i].dims[1], assocs[i] & 127, assocs[i] >> 7, g[i].fields[1]);
1121
1122 free(points);
1123 free(distances);
1124
1125 return r;
1126 }
1127
1128 char gopherbuffer[16 * 1024 * 1024];
1129
1130 int
1131 main(int argc, char *argv[])
1132 {
1133 ssize_t l;
1134 size_t k;
1135 struct gopherentry *g;
1136 char *host, *port, *selector;
1137 size_t *cassocs;
1138
1139 if (argc < 2) {
1140 printf("%s host [port [selector]]\n", argv[0]);
1141 return 1;
1142 }
1143
1144 host = argv[1];
1145 port = (argc > 2) ? argv[2] : "70";
1146 selector = (argc > 3) ? argv[3] : "";
1147
1148 l = gopher_request(host, port, selector, gopherbuffer, sizeof(gopherbuffer));
1149 if (l == -1)
1150 return 1;
1151
1152 l = gopher_parsemenu(gopherbuffer, &g);
1153 if (l == -1)
1154 return 1;
1155
1156 l = gopher_removeentries(g, l);
1157
1158 cassocs = calloc(l, sizeof(*cassocs));
1159 if (!cassocs)
1160 return 1;
1161 k = 5;
1162 clustering(host, port, selector, g, l, cassocs, k);
1163
1164 return 0;
1165 }